mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 00:30:27 +00:00
Merge pull request #64 from metabarcoding/push-yurwulsmpxkq
End of obilowmask
This commit is contained in:
@@ -8,7 +8,7 @@ import (
|
|||||||
// corresponds to the last commit, and not the one when the file will be
|
// corresponds to the last commit, and not the one when the file will be
|
||||||
// committed
|
// committed
|
||||||
|
|
||||||
var _Commit = "961abce"
|
var _Commit = "c1b9503"
|
||||||
var _Version = "Release 4.4.0"
|
var _Version = "Release 4.4.0"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package obilowmask
|
package obilowmask
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"math"
|
"math"
|
||||||
|
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
@@ -17,6 +18,7 @@ type MaskingMode int
|
|||||||
const (
|
const (
|
||||||
Mask MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters
|
Mask MaskingMode = iota // Mask mode: replace low-complexity regions with masked characters
|
||||||
Split // Split mode: split sequence into high-complexity fragments
|
Split // Split mode: split sequence into high-complexity fragments
|
||||||
|
Extract
|
||||||
)
|
)
|
||||||
|
|
||||||
// LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
|
// LowMaskWorker creates a worker to mask low-complexity regions in DNA sequences.
|
||||||
@@ -342,6 +344,76 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
return obiseq.BioSequenceSlice{seqCopy}, nil
|
return obiseq.BioSequenceSlice{seqCopy}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// selectMasked scans maskPosition and, for every maximal run of consecutive
// true entries, calls sequence.Subsequence(start, end, false) where start is
// the index of the first true entry of the run and end is the index just past
// the run. Each returned fragment is pushed, in scan order, onto a fresh
// BioSequenceSlice which is returned by value. The first error reported by
// Subsequence stops the scan and is returned together with a nil slice.
selectMasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
	fragments := obiseq.NewBioSequenceSlice()

	// runStart holds the index at which the current masked run began;
	// -1 means the scan is currently outside of any masked run.
	runStart := -1

	for pos, isMasked := range maskPosition {
		switch {
		case isMasked && runStart < 0:
			// First masked position after an unmasked stretch: open a run.
			runStart = pos

		case !isMasked && runStart >= 0:
			// First unmasked position after a masked run: close the run.
			fragment, err := sequence.Subsequence(runStart, pos, false)
			if err != nil {
				return nil, err
			}
			fragments.Push(fragment)
			runStart = -1
		}
	}

	// A run that is still open extends to the end of maskPosition.
	if runStart >= 0 {
		fragment, err := sequence.Subsequence(runStart, len(maskPosition), false)
		if err != nil {
			return nil, err
		}
		fragments.Push(fragment)
	}

	return *fragments, nil
}
|
||||||
|
|
||||||
|
// selectunmasked scans maskPosition and, for every maximal run of consecutive
// false entries, calls sequence.Subsequence(start, end, false) where start is
// the index of the first false entry of the run and end is the index just
// past the run. The fragments are pushed, in scan order, onto a fresh
// BioSequenceSlice which is returned by value. The first error reported by
// Subsequence stops the scan and is returned together with a nil slice.
selectunmasked := func(sequence *obiseq.BioSequence, maskPosition []bool) (obiseq.BioSequenceSlice, error) {
	kept := obiseq.NewBioSequenceSlice()

	// begin is the index where the current unmasked run started, or -1 when
	// the scan is not inside an unmasked run.
	begin := -1

	for idx, isMasked := range maskPosition {
		if !isMasked {
			// Unmasked position: open a run if none is in progress.
			if begin < 0 {
				begin = idx
			}
			continue
		}

		// Masked position: nothing to do unless it terminates a run.
		if begin < 0 {
			continue
		}

		piece, err := sequence.Subsequence(begin, idx, false)
		if err != nil {
			return nil, err
		}
		kept.Push(piece)
		begin = -1
	}

	// A run that is still open extends to the end of maskPosition.
	if begin >= 0 {
		piece, err := sequence.Subsequence(begin, len(maskPosition), false)
		if err != nil {
			return nil, err
		}
		kept.Push(piece)
	}

	return *kept, nil
}
|
||||||
|
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
// FUNCTION 7: masking - Main masking function
|
// FUNCTION 7: masking - Main masking function
|
||||||
// ========================================================================
|
// ========================================================================
|
||||||
@@ -425,7 +497,15 @@ func LowMaskWorker(kmer_size int, level_max int, threshold float64, mode Masking
|
|||||||
sequence.SetAttribute("mask", mask)
|
sequence.SetAttribute("mask", mask)
|
||||||
sequence.SetAttribute("Entropies", entropies)
|
sequence.SetAttribute("Entropies", entropies)
|
||||||
|
|
||||||
return applyMaskMode(sequence, remove, maskChar)
|
switch mode {
|
||||||
|
case Mask:
|
||||||
|
return applyMaskMode(sequence, remove, maskChar)
|
||||||
|
case Split:
|
||||||
|
return selectunmasked(sequence, remove)
|
||||||
|
case Extract:
|
||||||
|
return selectMasked(sequence, remove)
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("Unknown mode %d", mode)
|
||||||
}
|
}
|
||||||
|
|
||||||
return masking
|
return masking
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ var __kmer_size__ = 31
|
|||||||
var __level_max__ = 6
|
var __level_max__ = 6
|
||||||
var __threshold__ = 0.5
|
var __threshold__ = 0.5
|
||||||
var __split_mode__ = false
|
var __split_mode__ = false
|
||||||
|
var __low_mode__ = false
|
||||||
var __mask__ = "."
|
var __mask__ = "."
|
||||||
|
|
||||||
func LowMaskOptionSet(options *getoptions.GetOpt) {
|
func LowMaskOptionSet(options *getoptions.GetOpt) {
|
||||||
@@ -29,11 +30,15 @@ func LowMaskOptionSet(options *getoptions.GetOpt) {
|
|||||||
options.Description("entropy theshold used to mask a kmer"),
|
options.Description("entropy theshold used to mask a kmer"),
|
||||||
)
|
)
|
||||||
|
|
||||||
options.BoolVar(&__split_mode__, "--split-mode", __split_mode__,
|
options.BoolVar(&__split_mode__, "split-mode", __split_mode__,
|
||||||
options.Description("in split mode, input sequences are splitted to remove masked regions"),
|
options.Description("in split mode, input sequences are splitted to remove masked regions"),
|
||||||
)
|
)
|
||||||
|
|
||||||
options.StringVar(&__mask__, "--masking-char", __mask__,
|
options.BoolVar(&__low_mode__, "low-mode", __low_mode__,
|
||||||
|
options.Description("in split mode, input sequences are splitted to remove masked regions"),
|
||||||
|
)
|
||||||
|
|
||||||
|
options.StringVar(&__mask__, "masking-char", __mask__,
|
||||||
options.Description("Character used to mask low complexity region"),
|
options.Description("Character used to mask low complexity region"),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
@@ -56,9 +61,12 @@ func CLIThreshold() float64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func CLIMaskingMode() MaskingMode {
|
func CLIMaskingMode() MaskingMode {
|
||||||
if __split_mode__ {
|
switch {
|
||||||
|
case __low_mode__:
|
||||||
|
return Extract
|
||||||
|
case __split_mode__:
|
||||||
return Split
|
return Split
|
||||||
} else {
|
default:
|
||||||
return Mask
|
return Mask
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user