mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-10 09:40:27 +00:00
Adds the obimicrosat command
This commit is contained in:
104
pkg/obitools/obimicrosat/microsat.go
Normal file
104
pkg/obitools/obimicrosat/microsat.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package obimicrosat
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"github.com/dlclark/regexp2"
|
||||
)
|
||||
|
||||
func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {
|
||||
|
||||
min_unit := func(microsat string) int {
|
||||
for i := 1; i < len(microsat); i++ {
|
||||
s1 := microsat[0 : len(microsat)-i]
|
||||
s2 := microsat[i:]
|
||||
|
||||
if s1 == s2 {
|
||||
return i
|
||||
}
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
normalizedUnit := func(unit string) string {
|
||||
all := make([]string, 0, len(unit)*2)
|
||||
|
||||
for i := 0; i < len(unit); i++ {
|
||||
rotate := unit[i:] + unit[:i]
|
||||
revcomp_rotate := obiseq.NewBioSequence("", []byte(rotate), "").ReverseComplement(true).String()
|
||||
all = append(all, rotate, revcomp_rotate)
|
||||
}
|
||||
|
||||
sort.Slice(all, func(i, j int) bool {
|
||||
return all[i] < all[j]
|
||||
})
|
||||
|
||||
return all[0]
|
||||
}
|
||||
|
||||
build_regexp := func(minLength, maxLength, minUnits int) *regexp2.Regexp {
|
||||
return regexp2.MustCompile(
|
||||
fmt.Sprintf("([acgt]{%d,%d})\\1{%d,}",
|
||||
minLength,
|
||||
maxLength,
|
||||
minUnits-1,
|
||||
),
|
||||
regexp2.RE2)
|
||||
}
|
||||
|
||||
regexp := build_regexp(minLength, maxLength, minUnits)
|
||||
|
||||
w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
|
||||
match, _ := regexp.FindStringMatch(sequence.String())
|
||||
|
||||
if match == nil {
|
||||
return obiseq.BioSequenceSlice{}, nil
|
||||
}
|
||||
|
||||
unit_length := min_unit(match.String())
|
||||
|
||||
if unit_length < minLength {
|
||||
return obiseq.BioSequenceSlice{}, nil
|
||||
}
|
||||
|
||||
pattern := build_regexp(unit_length, unit_length, minUnits)
|
||||
|
||||
match, _ = pattern.FindStringMatch(sequence.String())
|
||||
|
||||
unit := match.String()[0:unit_length]
|
||||
|
||||
sequence.SetAttribute("microsat_unit_length", unit_length)
|
||||
sequence.SetAttribute("microsat_unit_count", match.Length/unit_length)
|
||||
sequence.SetAttribute("seq_length", sequence.Len())
|
||||
sequence.SetAttribute("microsat", match.String())
|
||||
sequence.SetAttribute("microsat_from", match.Index)
|
||||
sequence.SetAttribute("microsat_to", match.Index+match.Length-1)
|
||||
|
||||
sequence.SetAttribute("microsat_unit", unit)
|
||||
sequence.SetAttribute("microsat_unit_normalized", normalizedUnit(unit))
|
||||
|
||||
sequence.SetAttribute("microsat_left", sequence.String()[0:match.Index])
|
||||
sequence.SetAttribute("microsat_right", sequence.String()[match.Index+match.Length:])
|
||||
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
|
||||
return obiseq.SeqWorker(w)
|
||||
}
|
||||
|
||||
func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
var newIter obiiter.IBioSequence
|
||||
|
||||
worker := MakeMicrosatWorker(CLIMinUnitLength(), CLIMaxUnitLength(), CLIMinUnitCount())
|
||||
|
||||
newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
|
||||
|
||||
return newIter.FilterEmpty()
|
||||
|
||||
}
|
||||
55
pkg/obitools/obimicrosat/options.go
Normal file
55
pkg/obitools/obimicrosat/options.go
Normal file
@@ -0,0 +1,55 @@
|
||||
package obimicrosat
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
// Settings of the microsatellite detection. The values below are the
// defaults; MicroSatelliteOptionSet binds each variable to a command line
// option.
var (
	_MinUnitLength = 1 // minimum length of a microsatellite unit
	_MaxUnitLength = 6 // maximum length of a microsatellite unit
	_MinUnitCount  = 5 // minimum number of repeated units
)
|
||||
|
||||
// PCROptionSet defines every options related to a simulated PCR.
|
||||
//
|
||||
// The function adds to a CLI every options proposed to the user
|
||||
// to tune the parametters of the PCR simulation algorithm.
|
||||
//
|
||||
// # Parameters
|
||||
//
|
||||
// - option : is a pointer to a getoptions.GetOpt instance normaly
|
||||
// produced by the
|
||||
func MicroSatelliteOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_MinUnitLength, "min-unit-length", _MinUnitLength,
|
||||
options.Alias("m"),
|
||||
options.Description("Minimum length of a microsatellite unit."))
|
||||
|
||||
options.IntVar(&_MaxUnitLength, "max-unit-length", _MaxUnitLength,
|
||||
options.Alias("M"),
|
||||
options.Description("Maximum length of a microsatellite unit."))
|
||||
|
||||
options.IntVar(&_MinUnitCount, "min-unit-count", _MinUnitCount,
|
||||
options.Description("Minumum number of repeated units."))
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
MicroSatelliteOptionSet(options)
|
||||
}
|
||||
|
||||
func CLIMinUnitLength() int {
|
||||
return _MinUnitLength
|
||||
}
|
||||
|
||||
func CLIMaxUnitLength() int {
|
||||
return _MaxUnitLength
|
||||
}
|
||||
|
||||
func CLIMinUnitCount() int {
|
||||
return _MinUnitCount
|
||||
}
|
||||
|
||||
func CLIMicroSatRegex() string {
|
||||
return fmt.Sprintf("([acgt]{%d,%d})\\1{%d}", _MinUnitLength, _MaxUnitLength, _MinUnitCount-1)
|
||||
}
|
||||
@@ -132,7 +132,7 @@ func FindClosests(sequence *obiseq.BioSequence,
|
||||
lcs, alilength := -1, -1
|
||||
switch maxe {
|
||||
case 0:
|
||||
if obiutils.UnsafeStringFreomBytes(sequence.Sequence()) == obiutils.UnsafeStringFreomBytes(references[order].Sequence()) {
|
||||
if obiutils.UnsafeStringFromBytes(sequence.Sequence()) == obiutils.UnsafeStringFromBytes(references[order].Sequence()) {
|
||||
score = 0
|
||||
alilength = sequence.Len()
|
||||
lcs = alilength
|
||||
@@ -279,7 +279,7 @@ func Identify(sequence *obiseq.BioSequence,
|
||||
var bestmatch string
|
||||
var taxon *obitax.TaxNode
|
||||
|
||||
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFreomBytes(sequence.Sequence())]
|
||||
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFromBytes(sequence.Sequence())]
|
||||
if ok {
|
||||
taxon = exacttaxon.Taxon
|
||||
bestmatch = exacttaxon.Id
|
||||
@@ -399,7 +399,7 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
|
||||
|
||||
ft[len(ft)] = taxa[i]
|
||||
|
||||
seqstr := obiutils.UnsafeStringFreomBytes(seq.Sequence())
|
||||
seqstr := obiutils.UnsafeStringFromBytes(seq.Sequence())
|
||||
em, ok := exactmatch[seqstr]
|
||||
|
||||
if !ok {
|
||||
|
||||
Reference in New Issue
Block a user