mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Add option to obimicrosat to control microsat length and orientation
This commit is contained in:
@ -10,7 +10,33 @@ import (
|
|||||||
"github.com/dlclark/regexp2"
|
"github.com/dlclark/regexp2"
|
||||||
)
|
)
|
||||||
|
|
||||||
func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {
|
// MakeMicrosatWorker creates a SeqWorker that finds microsatellite regions in a BioSequence.
|
||||||
|
//
|
||||||
|
// The function takes three integer parameters: minLength, maxLength, and minUnits. minLength specifies the
|
||||||
|
// minimum length of the microsatellite region, maxLength specifies the maximum length, and minUnits specifies
|
||||||
|
// the minimum number of repeating units. The function returns an obiseq.SeqWorker, which is a Go function that
|
||||||
|
// takes a BioSequence as input and returns a BioSequenceSlice and an error. The SeqWorker performs the following
|
||||||
|
// steps:
|
||||||
|
// 1. It defines two helper functions: min_unit and normalizedUnit.
|
||||||
|
// 2. It defines a regular expression pattern based on the input parameters.
|
||||||
|
// 3. It defines the SeqWorker function w.
|
||||||
|
// 4. The w function searches for a match of the regular expression pattern in the sequence string.
|
||||||
|
// 5. If no match is found, it returns an empty BioSequenceSlice and nil error.
|
||||||
|
// 6. It calculates the length of the matching unit.
|
||||||
|
// 7. It checks if the unit length is less than minLength.
|
||||||
|
// 8. It creates a new regular expression pattern based on the unit length.
|
||||||
|
// 9. It extracts the matching unit from the sequence string.
|
||||||
|
// 10. It sets various attributes on the sequence.
|
||||||
|
// 11. It returns a BioSequenceSlice containing the original sequence and nil error.
|
||||||
|
//
|
||||||
|
// Parameters:
|
||||||
|
// - minLength: the minimum length of the microsatellite region.
|
||||||
|
// - maxLength: the maximum length of the microsatellite region.
|
||||||
|
// - minUnits: the minimum number of repeating units.
|
||||||
|
//
|
||||||
|
// Return type:
|
||||||
|
// - obiseq.SeqWorker: a Go function that takes a BioSequence as input and returns a BioSequenceSlice and an error.
|
||||||
|
func MakeMicrosatWorker(minUnitLength, maxUnitLength, minUnits, minLength, minflankLength int, reoriented bool) obiseq.SeqWorker {
|
||||||
|
|
||||||
min_unit := func(microsat string) int {
|
min_unit := func(microsat string) int {
|
||||||
for i := 1; i < len(microsat); i++ {
|
for i := 1; i < len(microsat); i++ {
|
||||||
@ -25,20 +51,30 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
normalizedUnit := func(unit string) string {
|
normalizedUnit := func(unit string) (string, bool) {
|
||||||
all := make([]string, 0, len(unit)*2)
|
all := make([]struct {
|
||||||
|
unit string
|
||||||
|
direct bool
|
||||||
|
}, 0, len(unit)*2)
|
||||||
|
|
||||||
for i := 0; i < len(unit); i++ {
|
for i := 0; i < len(unit); i++ {
|
||||||
rotate := unit[i:] + unit[:i]
|
rotate := unit[i:] + unit[:i]
|
||||||
revcomp_rotate := obiseq.NewBioSequence("", []byte(rotate), "").ReverseComplement(true).String()
|
revcomp_rotate := obiseq.NewBioSequence("", []byte(rotate), "").ReverseComplement(true).String()
|
||||||
all = append(all, rotate, revcomp_rotate)
|
all = append(all, struct {
|
||||||
|
unit string
|
||||||
|
direct bool
|
||||||
|
}{unit: rotate, direct: true},
|
||||||
|
struct {
|
||||||
|
unit string
|
||||||
|
direct bool
|
||||||
|
}{unit: revcomp_rotate, direct: false})
|
||||||
}
|
}
|
||||||
|
|
||||||
sort.Slice(all, func(i, j int) bool {
|
sort.Slice(all, func(i, j int) bool {
|
||||||
return all[i] < all[j]
|
return all[i].unit < all[j].unit
|
||||||
})
|
})
|
||||||
|
|
||||||
return all[0]
|
return all[0].unit, all[0].direct
|
||||||
}
|
}
|
||||||
|
|
||||||
build_regexp := func(minLength, maxLength, minUnits int) *regexp2.Regexp {
|
build_regexp := func(minLength, maxLength, minUnits int) *regexp2.Regexp {
|
||||||
@ -51,7 +87,7 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {
|
|||||||
regexp2.RE2)
|
regexp2.RE2)
|
||||||
}
|
}
|
||||||
|
|
||||||
regexp := build_regexp(minLength, maxLength, minUnits)
|
regexp := build_regexp(minUnitLength, maxUnitLength, minUnits)
|
||||||
|
|
||||||
w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||||
|
|
||||||
@ -63,7 +99,7 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {
|
|||||||
|
|
||||||
unit_length := min_unit(match.String())
|
unit_length := min_unit(match.String())
|
||||||
|
|
||||||
if unit_length < minLength {
|
if unit_length < minUnitLength {
|
||||||
return obiseq.BioSequenceSlice{}, nil
|
return obiseq.BioSequenceSlice{}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -71,20 +107,46 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {
|
|||||||
|
|
||||||
match, _ = pattern.FindStringMatch(sequence.String())
|
match, _ = pattern.FindStringMatch(sequence.String())
|
||||||
|
|
||||||
|
if match.Length < minLength {
|
||||||
|
return obiseq.BioSequenceSlice{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
unit := match.String()[0:unit_length]
|
unit := match.String()[0:unit_length]
|
||||||
|
normalized, direct := normalizedUnit(unit)
|
||||||
|
matchFrom := match.Index
|
||||||
|
|
||||||
|
if !direct && reoriented {
|
||||||
|
sequence = sequence.ReverseComplement(true)
|
||||||
|
sequence.SetId(sequence.Id() + "_cmp")
|
||||||
|
matchFrom = sequence.Len() - match.Index - match.Length
|
||||||
|
}
|
||||||
|
|
||||||
|
matchTo := matchFrom + match.Length
|
||||||
|
|
||||||
|
microsat := sequence.String()[matchFrom:matchTo]
|
||||||
|
unit = microsat[0:unit_length]
|
||||||
|
|
||||||
|
left := sequence.String()[0:matchFrom]
|
||||||
|
right := sequence.String()[matchTo:]
|
||||||
|
|
||||||
|
if len(left) < minflankLength || len(right) < minflankLength {
|
||||||
|
return obiseq.BioSequenceSlice{}, nil
|
||||||
|
}
|
||||||
|
|
||||||
sequence.SetAttribute("microsat_unit_length", unit_length)
|
sequence.SetAttribute("microsat_unit_length", unit_length)
|
||||||
sequence.SetAttribute("microsat_unit_count", match.Length/unit_length)
|
sequence.SetAttribute("microsat_unit_count", match.Length/unit_length)
|
||||||
sequence.SetAttribute("seq_length", sequence.Len())
|
sequence.SetAttribute("seq_length", sequence.Len())
|
||||||
sequence.SetAttribute("microsat", match.String())
|
sequence.SetAttribute("microsat", microsat)
|
||||||
sequence.SetAttribute("microsat_from", match.Index)
|
sequence.SetAttribute("microsat_from", matchFrom+1)
|
||||||
sequence.SetAttribute("microsat_to", match.Index+match.Length-1)
|
sequence.SetAttribute("microsat_to", matchTo)
|
||||||
|
|
||||||
sequence.SetAttribute("microsat_unit", unit)
|
sequence.SetAttribute("microsat_unit", unit)
|
||||||
sequence.SetAttribute("microsat_unit_normalized", normalizedUnit(unit))
|
sequence.SetAttribute("microsat_unit_normalized", normalized)
|
||||||
|
|
||||||
sequence.SetAttribute("microsat_left", sequence.String()[0:match.Index])
|
sequence.SetAttribute("microsat_unit_orientation", map[bool]string{true: "direct", false: "reverse"}[direct])
|
||||||
sequence.SetAttribute("microsat_right", sequence.String()[match.Index+match.Length:])
|
|
||||||
|
sequence.SetAttribute("microsat_left", left)
|
||||||
|
sequence.SetAttribute("microsat_right", right)
|
||||||
|
|
||||||
return obiseq.BioSequenceSlice{sequence}, nil
|
return obiseq.BioSequenceSlice{sequence}, nil
|
||||||
}
|
}
|
||||||
@ -92,10 +154,19 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {
|
|||||||
return obiseq.SeqWorker(w)
|
return obiseq.SeqWorker(w)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// CLIAnnotateMicrosat is a function that annotates microsatellites in a given sequence.
|
||||||
|
//
|
||||||
|
// It takes an iterator of type `obiiter.IBioSequence` as a parameter.
|
||||||
|
// The function returns an iterator of type `obiiter.IBioSequence`.
|
||||||
func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
var newIter obiiter.IBioSequence
|
var newIter obiiter.IBioSequence
|
||||||
|
|
||||||
worker := MakeMicrosatWorker(CLIMinUnitLength(), CLIMaxUnitLength(), CLIMinUnitCount())
|
worker := MakeMicrosatWorker(CLIMinUnitLength(),
|
||||||
|
CLIMaxUnitLength(),
|
||||||
|
CLIMinUnitCount(),
|
||||||
|
CLIMinLength(),
|
||||||
|
CLIMinFlankLength(),
|
||||||
|
CLIReoriented())
|
||||||
|
|
||||||
newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
|
newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
|
||||||
|
|
||||||
|
@ -10,6 +10,9 @@ import (
|
|||||||
var _MinUnitLength = 1
|
var _MinUnitLength = 1
|
||||||
var _MaxUnitLength = 6
|
var _MaxUnitLength = 6
|
||||||
var _MinUnitCount = 5
|
var _MinUnitCount = 5
|
||||||
|
var _MinLength = 20
|
||||||
|
var _MinFlankLength = 0
|
||||||
|
var _NotReoriented = false
|
||||||
|
|
||||||
// PCROptionSet defines every options related to a simulated PCR.
|
// PCROptionSet defines every options related to a simulated PCR.
|
||||||
//
|
//
|
||||||
@ -31,6 +34,18 @@ func MicroSatelliteOptionSet(options *getoptions.GetOpt) {
|
|||||||
|
|
||||||
options.IntVar(&_MinUnitCount, "min-unit-count", _MinUnitCount,
|
options.IntVar(&_MinUnitCount, "min-unit-count", _MinUnitCount,
|
||||||
options.Description("Minumum number of repeated units."))
|
options.Description("Minumum number of repeated units."))
|
||||||
|
|
||||||
|
options.IntVar(&_MinLength, "min-length", _MinLength,
|
||||||
|
options.Alias("l"),
|
||||||
|
options.Description("Minimum length of a microsatellite."))
|
||||||
|
|
||||||
|
options.IntVar(&_MinFlankLength, "min-flank-length", _MinFlankLength,
|
||||||
|
options.Alias("f"),
|
||||||
|
options.Description("Minimum length of the flanking sequences."))
|
||||||
|
|
||||||
|
options.BoolVar(&_NotReoriented, "not-reoriented", _NotReoriented,
|
||||||
|
options.Alias("n"),
|
||||||
|
options.Description("Do not reorient the microsatellites."))
|
||||||
}
|
}
|
||||||
|
|
||||||
func OptionSet(options *getoptions.GetOpt) {
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
@ -53,3 +68,15 @@ func CLIMinUnitCount() int {
|
|||||||
func CLIMicroSatRegex() string {
|
func CLIMicroSatRegex() string {
|
||||||
return fmt.Sprintf("([acgt]{%d,%d})\\1{%d}", _MinUnitLength, _MaxUnitLength, _MinUnitCount-1)
|
return fmt.Sprintf("([acgt]{%d,%d})\\1{%d}", _MinUnitLength, _MaxUnitLength, _MinUnitCount-1)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func CLIMinLength() int {
|
||||||
|
return _MinLength
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIMinFlankLength() int {
|
||||||
|
return _MinFlankLength
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIReoriented() bool {
|
||||||
|
return !_NotReoriented
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user