mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds the notion of NA value
This commit is contained in:
@ -6,9 +6,9 @@ import (
|
|||||||
"strconv"
|
"strconv"
|
||||||
)
|
)
|
||||||
|
|
||||||
type SequenceClassifier func(sequence BioSequence) string
|
type BioSequenceClassifier func(sequence BioSequence) string
|
||||||
|
|
||||||
func AnnotationClassifier(key string) SequenceClassifier {
|
func AnnotationClassifier(key string, na string) BioSequenceClassifier {
|
||||||
f := func(sequence BioSequence) string {
|
f := func(sequence BioSequence) string {
|
||||||
if sequence.HasAnnotation() {
|
if sequence.HasAnnotation() {
|
||||||
value, ok := sequence.Annotations()[key]
|
value, ok := sequence.Annotations()[key]
|
||||||
@ -22,15 +22,13 @@ func AnnotationClassifier(key string) SequenceClassifier {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return ""
|
return na
|
||||||
}
|
}
|
||||||
|
|
||||||
return f
|
return f
|
||||||
}
|
}
|
||||||
|
|
||||||
var SampleClassifier = AnnotationClassifier("sample")
|
func PredicateClassifier(predicate SequencePredicate) BioSequenceClassifier {
|
||||||
|
|
||||||
func PredicateClassifier(predicate SequencePredicate) SequenceClassifier {
|
|
||||||
f := func(sequence BioSequence) string {
|
f := func(sequence BioSequence) string {
|
||||||
if predicate(sequence) {
|
if predicate(sequence) {
|
||||||
return "true"
|
return "true"
|
||||||
@ -44,7 +42,7 @@ func PredicateClassifier(predicate SequencePredicate) SequenceClassifier {
|
|||||||
|
|
||||||
// Builds a classifier function based on CRC32 of the sequence
|
// Builds a classifier function based on CRC32 of the sequence
|
||||||
//
|
//
|
||||||
func HashClassifier(size int) SequenceClassifier {
|
func HashClassifier(size int) BioSequenceClassifier {
|
||||||
f := func(sequence BioSequence) string {
|
f := func(sequence BioSequence) string {
|
||||||
h := crc32.ChecksumIEEE(sequence.Sequence()) % uint32(size)
|
h := crc32.ChecksumIEEE(sequence.Sequence()) % uint32(size)
|
||||||
return strconv.Itoa(int(h))
|
return strconv.Itoa(int(h))
|
||||||
@ -53,7 +51,17 @@ func HashClassifier(size int) SequenceClassifier {
|
|||||||
return f
|
return f
|
||||||
}
|
}
|
||||||
|
|
||||||
func RotateClassifier(size int) SequenceClassifier {
|
// Builds a classifier function based on the sequence
|
||||||
|
//
|
||||||
|
func SequenceClassifier() BioSequenceClassifier {
|
||||||
|
f := func(sequence BioSequence) string {
|
||||||
|
return sequence.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
return f
|
||||||
|
}
|
||||||
|
|
||||||
|
func RotateClassifier(size int) BioSequenceClassifier {
|
||||||
n := 0
|
n := 0
|
||||||
f := func(sequence BioSequence) string {
|
f := func(sequence BioSequence) string {
|
||||||
h := n % size
|
h := n % size
|
||||||
|
@ -27,7 +27,7 @@ func (dist *IDistribute) News() chan string {
|
|||||||
return dist.news
|
return dist.news
|
||||||
}
|
}
|
||||||
|
|
||||||
func (iterator IBioSequenceBatch) Distribute(class SequenceClassifier, sizes ...int) IDistribute {
|
func (iterator IBioSequenceBatch) Distribute(class BioSequenceClassifier, sizes ...int) IDistribute {
|
||||||
batchsize := 5000
|
batchsize := 5000
|
||||||
buffsize := 2
|
buffsize := 2
|
||||||
|
|
||||||
|
@ -20,7 +20,7 @@ func (sequence BioSequence) HasStatsOn(key string) bool {
|
|||||||
return ok
|
return ok
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sequence BioSequence) StatsOn(key string) StatsOnValues {
|
func (sequence BioSequence) StatsOn(key string, na string) StatsOnValues {
|
||||||
mkey := "merged_" + key
|
mkey := "merged_" + key
|
||||||
annotations := sequence.Annotations()
|
annotations := sequence.Annotations()
|
||||||
istat, ok := annotations[mkey]
|
istat, ok := annotations[mkey]
|
||||||
@ -44,20 +44,22 @@ func (sequence BioSequence) StatsOn(key string) StatsOnValues {
|
|||||||
newstat = true
|
newstat = true
|
||||||
}
|
}
|
||||||
|
|
||||||
if newstat && sequence.StatsPlusOne(key, sequence) {
|
if newstat && sequence.StatsPlusOne(key, sequence, na) {
|
||||||
delete(sequence.Annotations(), key)
|
delete(sequence.Annotations(), key)
|
||||||
}
|
}
|
||||||
|
|
||||||
return stats
|
return stats
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence) bool {
|
func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence, na string) bool {
|
||||||
|
sval := na
|
||||||
|
stats := sequence.StatsOn(key,na)
|
||||||
|
retval := false
|
||||||
|
|
||||||
if toAdd.HasAnnotation() {
|
if toAdd.HasAnnotation() {
|
||||||
stats := sequence.StatsOn(key)
|
|
||||||
value, ok := toAdd.Annotations()[key]
|
value, ok := toAdd.Annotations()[key]
|
||||||
|
|
||||||
if ok {
|
if ok {
|
||||||
var sval string
|
|
||||||
|
|
||||||
switch value := value.(type) {
|
switch value := value.(type) {
|
||||||
case string:
|
case string:
|
||||||
@ -69,17 +71,18 @@ func (sequence BioSequence) StatsPlusOne(key string, toAdd BioSequence) bool {
|
|||||||
default:
|
default:
|
||||||
log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v)", value)
|
log.Fatalf("Trying to make stats on a none string, integer or boolean value (%v)", value)
|
||||||
}
|
}
|
||||||
old, ok := stats[sval]
|
retval = true
|
||||||
if !ok {
|
|
||||||
old = 0
|
|
||||||
}
|
|
||||||
stats[sval] = old + 1
|
|
||||||
|
|
||||||
return true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return false
|
old, ok := stats[sval]
|
||||||
|
if !ok {
|
||||||
|
old = 0
|
||||||
|
}
|
||||||
|
stats[sval] = old + 1
|
||||||
|
|
||||||
|
return retval
|
||||||
}
|
}
|
||||||
|
|
||||||
func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
|
func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
|
||||||
@ -94,7 +97,7 @@ func (stats StatsOnValues) Merge(toMerged StatsOnValues) StatsOnValues {
|
|||||||
return stats
|
return stats
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...string) BioSequence {
|
func (sequence BioSequence) Merge(tomerge BioSequence, na string, inplace bool, statsOn ...string) BioSequence {
|
||||||
if !inplace {
|
if !inplace {
|
||||||
sequence = sequence.Copy()
|
sequence = sequence.Copy()
|
||||||
}
|
}
|
||||||
@ -105,15 +108,15 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str
|
|||||||
|
|
||||||
annotation := sequence.Annotations()
|
annotation := sequence.Annotations()
|
||||||
|
|
||||||
count := tomerge.Count() + sequence.Count()
|
count := sequence.Count() + tomerge.Count()
|
||||||
|
|
||||||
for _, key := range keys {
|
for _, key := range statsOn {
|
||||||
if tomerge.HasStatsOn(key) {
|
if tomerge.HasStatsOn(key) {
|
||||||
smk := sequence.StatsOn(key)
|
smk := sequence.StatsOn(key,na)
|
||||||
mmk := tomerge.StatsOn(key)
|
mmk := tomerge.StatsOn(key,na)
|
||||||
smk.Merge(mmk)
|
smk.Merge(mmk)
|
||||||
} else {
|
} else {
|
||||||
sequence.StatsPlusOne(key, tomerge)
|
sequence.StatsPlusOne(key, tomerge,na)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -140,89 +143,23 @@ func (sequence BioSequence) Merge(tomerge BioSequence, inplace bool, keys ...str
|
|||||||
return sequence
|
return sequence
|
||||||
}
|
}
|
||||||
|
|
||||||
func (sequences BioSequenceSlice) Unique(statsOn []string, keys ...string) BioSequenceSlice {
|
func (sequences BioSequenceSlice) Merge(na string, statsOn ...string) BioSequenceSlice {
|
||||||
uniq := make(map[string]*BioSequenceSlice, len(sequences))
|
seq := sequences[0]
|
||||||
nVariant := 0
|
seq.SetQualities(nil)
|
||||||
|
seq.Annotations()["count"] = 1
|
||||||
for _, seq := range sequences {
|
|
||||||
|
|
||||||
sstring := seq.String()
|
|
||||||
pgroup, ok := uniq[sstring]
|
|
||||||
|
|
||||||
if !ok {
|
|
||||||
group := make(BioSequenceSlice, 0, 10)
|
|
||||||
pgroup = &group
|
|
||||||
uniq[sstring] = pgroup
|
|
||||||
}
|
|
||||||
|
|
||||||
ok = false
|
|
||||||
i := 0
|
|
||||||
var s BioSequence
|
|
||||||
|
|
||||||
for i, s = range *pgroup {
|
|
||||||
ok = true
|
|
||||||
switch {
|
|
||||||
case seq.HasAnnotation() && s.HasAnnotation():
|
|
||||||
for _, k := range keys {
|
|
||||||
seqV, seqOk := seq.Annotations()[k]
|
|
||||||
sV, sOk := s.Annotations()[k]
|
|
||||||
|
|
||||||
ok = ok && ((!seqOk && !sOk) || ((seqOk && sOk) && (seqV == sV)))
|
|
||||||
|
|
||||||
if !ok {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
case seq.HasAnnotation() && !s.HasAnnotation():
|
|
||||||
for _, k := range keys {
|
|
||||||
_, seqOk := seq.Annotations()[k]
|
|
||||||
ok = ok && !seqOk
|
|
||||||
if !ok {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
case !seq.HasAnnotation() && s.HasAnnotation():
|
|
||||||
for _, k := range keys {
|
|
||||||
_, sOk := s.Annotations()[k]
|
|
||||||
ok = ok && !sOk
|
|
||||||
if !ok {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
ok = true
|
|
||||||
}
|
|
||||||
|
|
||||||
if ok {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ok {
|
|
||||||
(*pgroup)[i] = s.Merge(seq, true, statsOn...)
|
|
||||||
} else {
|
|
||||||
seq.SetQualities(nil)
|
|
||||||
if seq.Count() == 1 {
|
|
||||||
seq.Annotations()["count"] = 1
|
|
||||||
}
|
|
||||||
*pgroup = append(*pgroup, seq)
|
|
||||||
nVariant++
|
|
||||||
}
|
|
||||||
|
|
||||||
|
for _, toMerge := range sequences[1:] {
|
||||||
|
seq.Merge(toMerge, na, true, statsOn...)
|
||||||
|
toMerge.Recycle()
|
||||||
}
|
}
|
||||||
|
|
||||||
output := make(BioSequenceSlice, 0, nVariant)
|
return sequences[0:1]
|
||||||
for _, seqs := range uniq {
|
|
||||||
output = append(output, *seqs...)
|
|
||||||
}
|
|
||||||
|
|
||||||
return output
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func UniqueSliceWorker(statsOn []string, keys ...string) SeqSliceWorker {
|
func MergeSliceWorker(na string, statsOn ...string) SeqSliceWorker {
|
||||||
|
|
||||||
worker := func(sequences BioSequenceSlice) BioSequenceSlice {
|
worker := func(sequences BioSequenceSlice) BioSequenceSlice {
|
||||||
return sequences.Unique(statsOn, keys...)
|
return sequences.Merge(na, statsOn...)
|
||||||
}
|
}
|
||||||
|
|
||||||
return worker
|
return worker
|
||||||
|
@ -14,6 +14,7 @@ var _FilenamePattern = ""
|
|||||||
var _SequenceClassifierTag = ""
|
var _SequenceClassifierTag = ""
|
||||||
var _BatchCount = 0
|
var _BatchCount = 0
|
||||||
var _HashSize = 0
|
var _HashSize = 0
|
||||||
|
var _NAValue = "NA"
|
||||||
|
|
||||||
func DistributeOptionSet(options *getoptions.GetOpt) {
|
func DistributeOptionSet(options *getoptions.GetOpt) {
|
||||||
options.StringVar(&_FilenamePattern, "pattern", _FilenamePattern,
|
options.StringVar(&_FilenamePattern, "pattern", _FilenamePattern,
|
||||||
@ -29,6 +30,9 @@ func DistributeOptionSet(options *getoptions.GetOpt) {
|
|||||||
"The name must corresponds to a string, a integer or a boolean value. "+
|
"The name must corresponds to a string, a integer or a boolean value. "+
|
||||||
"That value will be used to dispatch sequences amoong the different files"))
|
"That value will be used to dispatch sequences amoong the different files"))
|
||||||
|
|
||||||
|
options.StringVar(&_NAValue, "na-value", _NAValue,
|
||||||
|
options.Description("Value used when the classifier tag is not defined for a sequence."))
|
||||||
|
|
||||||
options.IntVar(&_BatchCount, "batches", 0,
|
options.IntVar(&_BatchCount, "batches", 0,
|
||||||
options.Alias("n"),
|
options.Alias("n"),
|
||||||
options.Description("Indicates in how many batches the input file must bee splitted."))
|
options.Description("Indicates in how many batches the input file must bee splitted."))
|
||||||
@ -44,10 +48,10 @@ func OptionSet(options *getoptions.GetOpt) {
|
|||||||
DistributeOptionSet(options)
|
DistributeOptionSet(options)
|
||||||
}
|
}
|
||||||
|
|
||||||
func CLISequenceClassifier() obiseq.SequenceClassifier {
|
func CLISequenceClassifier() obiseq.BioSequenceClassifier {
|
||||||
switch {
|
switch {
|
||||||
case _SequenceClassifierTag != "":
|
case _SequenceClassifierTag != "":
|
||||||
return obiseq.AnnotationClassifier(_SequenceClassifierTag)
|
return obiseq.AnnotationClassifier(_SequenceClassifierTag, _NAValue)
|
||||||
case _BatchCount > 0:
|
case _BatchCount > 0:
|
||||||
return obiseq.RotateClassifier(_BatchCount)
|
return obiseq.RotateClassifier(_BatchCount)
|
||||||
case _HashSize > 0:
|
case _HashSize > 0:
|
||||||
@ -66,3 +70,7 @@ func CLIFileNamePattern() string {
|
|||||||
|
|
||||||
return _FilenamePattern
|
return _FilenamePattern
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func CLINAValue() string {
|
||||||
|
return _NAValue
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user