obimultiplex saves unassigned sequence

This commit is contained in:
2022-02-01 23:25:19 +01:00
parent e9cdfd7e03
commit 98a4363d22
6 changed files with 189 additions and 0 deletions

View File

@ -31,4 +31,5 @@ func main() {
sequences, _ := obiconvert.ReadBioSequencesBatch(args...) sequences, _ := obiconvert.ReadBioSequencesBatch(args...)
amplicons, _ := obimultiplex.IExtractBarcodeBatches(sequences) amplicons, _ := obimultiplex.IExtractBarcodeBatches(sequences)
obiconvert.WriteBioSequencesBatch(amplicons, true) obiconvert.WriteBioSequencesBatch(amplicons, true)
amplicons.Wait()
} }

View File

@ -80,6 +80,10 @@ func WriteSequenceBatch(iterator obiseq.IBioSequenceBatch,
return newIter, err return newIter, err
} }
if iterator.Finished() {
return iterator, nil
}
return obiseq.NilIBioSequenceBatch, fmt.Errorf("input iterator not ready") return obiseq.NilIBioSequenceBatch, fmt.Errorf("input iterator not ready")
} }

View File

@ -372,3 +372,70 @@ func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, sizes ...i
return newIter return newIter
} }
func (iterator IBioSequenceBatch) DivideOn(predicate SequencePredicate,
size int, sizes ...int) (IBioSequenceBatch, IBioSequenceBatch) {
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
buffsize = sizes[1]
}
trueIter := MakeIBioSequenceBatch(buffsize)
falseIter := MakeIBioSequenceBatch(buffsize)
trueIter.Add(1)
falseIter.Add(1)
go func() {
trueIter.Wait()
falseIter.Wait()
close(trueIter.Channel())
close(falseIter.Channel())
}()
go func() {
trueOrder := 0
falseOrder := 0
iterator = iterator.SortBatches()
trueSlice := make(BioSequenceSlice, 0, size)
falseSlice := make(BioSequenceSlice, 0, size)
for iterator.Next() {
seqs := iterator.Get()
for _, s := range seqs.slice {
if predicate(s) {
trueSlice = append(trueSlice, s)
} else {
falseSlice = append(falseSlice, s)
}
if len(trueSlice) == size {
trueIter.Channel() <- MakeBioSequenceBatch(trueOrder, trueSlice...)
trueOrder++
trueSlice = make(BioSequenceSlice, 0, size)
}
if len(falseSlice) == size {
falseIter.Channel() <- MakeBioSequenceBatch(falseOrder, falseSlice...)
falseOrder++
falseSlice = make(BioSequenceSlice, 0, size)
}
}
}
if len(trueSlice) > 0 {
trueIter.Channel() <- MakeBioSequenceBatch(trueOrder, trueSlice...)
}
if len(falseSlice) > 0 {
falseIter.Channel() <- MakeBioSequenceBatch(falseOrder, falseSlice...)
}
trueIter.Done()
falseIter.Done()
}()
return trueIter, falseIter
}

77
pkg/obiseq/predicate.go Normal file
View File

@ -0,0 +1,77 @@
package obiseq
type SequencePredicate func(BioSequence) bool
func (predicate1 SequencePredicate) And(predicate2 SequencePredicate) SequencePredicate {
f := func(sequence BioSequence) bool {
return predicate1(sequence) && predicate2(sequence)
}
return f
}
func (predicate1 SequencePredicate) Or(predicate2 SequencePredicate) SequencePredicate {
f := func(sequence BioSequence) bool {
return predicate1(sequence) || predicate2(sequence)
}
return f
}
func (predicate1 SequencePredicate) Xor(predicate2 SequencePredicate) SequencePredicate {
f := func(sequence BioSequence) bool {
p1 := predicate1(sequence)
p2 := predicate2(sequence)
return (p1 && !p2) || (p2 && !p1)
}
return f
}
func (predicate1 SequencePredicate) Not() SequencePredicate {
f := func(sequence BioSequence) bool {
return !predicate1(sequence)
}
return f
}
func HasAttribute(name string) SequencePredicate {
f := func(sequence BioSequence) bool {
if sequence.HasAnnotation() {
_, ok := (sequence.Annotations())[name]
return ok
}
return false
}
return f
}
func MoreAbundantThan(count int) SequencePredicate {
f := func(sequence BioSequence) bool {
return sequence.Count() > count
}
return f
}
func IsLongerOrEqualTo(length int) SequencePredicate {
f := func(sequence BioSequence) bool {
return sequence.Length() >= length
}
return f
}
func IsShorterOrEqualTo(length int) SequencePredicate {
f := func(sequence BioSequence) bool {
return sequence.Length() <= length
}
return f
}

View File

@ -1,5 +1,11 @@
package obitax package obitax
import (
"log"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
)
func (taxon *TaxNode) IsSubCladeOf(parent *TaxNode) bool { func (taxon *TaxNode) IsSubCladeOf(parent *TaxNode) bool {
for taxon.taxid != parent.taxid && taxon.parent != taxon.taxid { for taxon.taxid != parent.taxid && taxon.parent != taxon.taxid {
@ -19,3 +25,18 @@ func (taxon *TaxNode) IsBelongingSubclades(clades *TaxonSet) bool {
return ok return ok
} }
func IsSubCladeOf(taxonomy Taxonomy, taxid int) obiseq.SequencePredicate {
parent, err := taxonomy.Taxon(taxid)
if err != nil {
log.Fatalf("Cannot find taxon : %d (%v)", taxid, err)
}
f := func(sequence obiseq.BioSequence) bool {
taxon, err := taxonomy.Taxon(sequence.Taxid())
return err == nil && taxon.IsSubCladeOf(parent)
}
return f
}

View File

@ -6,6 +6,7 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obingslibrary" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obingslibrary"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
) )
func IExtractBarcodeBatches(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequenceBatch, error) { func IExtractBarcodeBatches(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSequenceBatch, error) {
@ -31,9 +32,27 @@ func IExtractBarcodeBatches(iterator obiseq.IBioSequenceBatch) (obiseq.IBioSeque
newIter := iterator.MakeISliceWorker(worker) newIter := iterator.MakeISliceWorker(worker)
if !CLIConservedErrors() { if !CLIConservedErrors() {
log.Println("Discards unassigned sequences")
newIter = newIter.Rebatch(obioptions.CLIBatchSize()) newIter = newIter.Rebatch(obioptions.CLIBatchSize())
} }
var unidentified obiseq.IBioSequenceBatch
if CLIUnidentifiedFileName() != "" {
log.Printf("Unassigned sequences saved in file: %s\n", CLIUnidentifiedFileName())
unidentified, newIter = newIter.DivideOn(obiseq.HasAttribute("demultiplex_error"),
obioptions.CLIBatchSize())
go func() {
_, err := obiconvert.WriteBioSequencesBatch(unidentified,
true,
CLIUnidentifiedFileName())
if err != nil {
log.Fatalf("%v", err)
}
}()
}
log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers()) log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers())
return newIter, nil return newIter, nil