Adds the obimicrosat command

This commit is contained in:
Eric Coissac
2024-08-05 15:31:20 +02:00
parent 3f57935328
commit bdb96dda94
11 changed files with 419 additions and 5 deletions

View File

@@ -0,0 +1,104 @@
package obimicrosat
import (
"fmt"
"sort"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/dlclark/regexp2"
)
// MakeMicrosatWorker builds a SeqWorker that looks for a microsatellite in a
// sequence and annotates the sequence with its description.
//
// # Parameters
//
//   - minLength : minimum length of the repeated unit.
//   - maxLength : maximum length of the repeated unit.
//   - minUnits : minimum number of consecutive occurrences of the unit.
//
// # Returns
//
// A worker returning a slice holding the annotated sequence when a
// microsatellite is found, and an empty slice otherwise. When a microsatellite
// is found the following attributes are set on the sequence:
// microsat_unit_length, microsat_unit_count, seq_length, microsat,
// microsat_from, microsat_to (inclusive), microsat_unit,
// microsat_unit_normalized, microsat_left and microsat_right.
//
// The worker returns an error when the regular expression engine reports one.
// Only lowercase a, c, g and t are recognised by the pattern.
func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {

	// minUnit returns the smallest shift i (0 < i < len(microsat)) for which
	// the string is equal to itself shifted by i, i.e. its smallest period.
	// It returns 0 when no such shift exists.
	minUnit := func(microsat string) int {
		for i := 1; i < len(microsat); i++ {
			if microsat[:len(microsat)-i] == microsat[i:] {
				return i
			}
		}
		return 0
	}

	// normalizedUnit returns the lexicographically smallest string among all
	// the rotations of unit and the reverse complements of those rotations.
	// unit must not be empty.
	normalizedUnit := func(unit string) string {
		all := make([]string, 0, len(unit)*2)
		for i := 0; i < len(unit); i++ {
			rotate := unit[i:] + unit[:i]
			revcompRotate := obiseq.NewBioSequence("", []byte(rotate), "").ReverseComplement(true).String()
			all = append(all, rotate, revcompRotate)
		}
		sort.Strings(all)
		return all[0]
	}

	// buildRegexp compiles the pattern matching a unit of minLength to
	// maxLength nucleotides followed by at least minUnits-1 copies of itself.
	buildRegexp := func(minLength, maxLength, minUnits int) *regexp2.Regexp {
		return regexp2.MustCompile(
			fmt.Sprintf("([acgt]{%d,%d})\\1{%d,}",
				minLength,
				maxLength,
				minUnits-1,
			),
			regexp2.RE2)
	}

	detector := buildRegexp(minLength, maxLength, minUnits)

	// The fixed-width patterns used to refine a hit are compiled once here
	// instead of once per matching sequence.
	firstUnit := minLength
	if firstUnit < 1 {
		firstUnit = 1
	}
	var unitPatterns []*regexp2.Regexp
	for length := firstUnit; length <= maxLength; length++ {
		unitPatterns = append(unitPatterns, buildRegexp(length, length, minUnits))
	}

	unitPattern := func(length int) *regexp2.Regexp {
		if i := length - firstUnit; i >= 0 && i < len(unitPatterns) {
			return unitPatterns[i]
		}
		// Outside the precompiled range: compile on demand.
		return buildRegexp(length, length, minUnits)
	}

	w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		seq := sequence.String()

		match, err := detector.FindStringMatch(seq)
		if err != nil {
			return obiseq.BioSequenceSlice{}, fmt.Errorf("microsatellite search: %w", err)
		}
		if match == nil {
			return obiseq.BioSequenceSlice{}, nil
		}

		// The greedy pattern may capture several copies of the true unit
		// (e.g. "acac" for "ac"); reduce the hit to its smallest period.
		unitLength := minUnit(match.String())

		// unitLength < 1 means no period was found; it is rejected explicitly
		// so that the division below can never be a division by zero.
		if unitLength < 1 || unitLength < minLength {
			return obiseq.BioSequenceSlice{}, nil
		}

		// Search the whole sequence again with the unit length fixed, so that
		// the reported match is expressed in units of the smallest period.
		match, err = unitPattern(unitLength).FindStringMatch(seq)
		if err != nil {
			return obiseq.BioSequenceSlice{}, fmt.Errorf("microsatellite search: %w", err)
		}
		if match == nil {
			return obiseq.BioSequenceSlice{}, nil
		}

		microsat := match.String()
		unit := microsat[0:unitLength]

		sequence.SetAttribute("microsat_unit_length", unitLength)
		sequence.SetAttribute("microsat_unit_count", match.Length/unitLength)
		sequence.SetAttribute("seq_length", sequence.Len())
		sequence.SetAttribute("microsat", microsat)
		sequence.SetAttribute("microsat_from", match.Index)
		sequence.SetAttribute("microsat_to", match.Index+match.Length-1)

		sequence.SetAttribute("microsat_unit", unit)
		sequence.SetAttribute("microsat_unit_normalized", normalizedUnit(unit))

		sequence.SetAttribute("microsat_left", seq[0:match.Index])
		sequence.SetAttribute("microsat_right", seq[match.Index+match.Length:])

		return obiseq.BioSequenceSlice{sequence}, nil
	}

	return obiseq.SeqWorker(w)
}
// CLIAnnotateMicrosat applies the microsatellite worker, configured from the
// command line options, to every sequence of iterator and returns the
// resulting iterator once passed through FilterEmpty.
//
// The worker is run with the number of workers given by
// obioptions.CLIParallelWorkers().
func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence {
	annotated := iterator.MakeIWorker(
		MakeMicrosatWorker(CLIMinUnitLength(), CLIMaxUnitLength(), CLIMinUnitCount()),
		false, // NOTE(review): presumably the break-on-error flag — confirm against MakeIWorker
		obioptions.CLIParallelWorkers(),
	)

	return annotated.FilterEmpty()
}

View File

@@ -0,0 +1,55 @@
package obimicrosat
import (
"fmt"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Values of the microsatellite detection options. They are initialised with
// the defaults and overwritten by the command line parser.
var (
	// _MinUnitLength is the minimum length of a microsatellite unit.
	_MinUnitLength = 1

	// _MaxUnitLength is the maximum length of a microsatellite unit.
	_MaxUnitLength = 6

	// _MinUnitCount is the minimum number of repeated units.
	_MinUnitCount = 5
)
// MicroSatelliteOptionSet defines every option related to the detection of
// microsatellites.
//
// The function adds to a CLI the options proposed to the user to tune the
// microsatellite search: the minimum and maximum length of the repeated unit
// and the minimum number of repeated units.
//
// # Parameters
//
//   - options : is a pointer to the getoptions.GetOpt instance on which the
//     options are registered.
func MicroSatelliteOptionSet(options *getoptions.GetOpt) {
	options.IntVar(&_MinUnitLength, "min-unit-length", _MinUnitLength,
		options.Alias("m"),
		options.Description("Minimum length of a microsatellite unit."))

	options.IntVar(&_MaxUnitLength, "max-unit-length", _MaxUnitLength,
		options.Alias("M"),
		options.Description("Maximum length of a microsatellite unit."))

	options.IntVar(&_MinUnitCount, "min-unit-count", _MinUnitCount,
		options.Description("Minimum number of repeated units."))
}
// OptionSet registers on options the option set of obiconvert followed by the
// microsatellite options declared by MicroSatelliteOptionSet.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
MicroSatelliteOptionSet(options)
}
// CLIMinUnitLength returns the value of the min-unit-length option: the
// minimum length of a microsatellite unit.
func CLIMinUnitLength() int {
return _MinUnitLength
}
// CLIMaxUnitLength returns the value of the max-unit-length option: the
// maximum length of a microsatellite unit.
func CLIMaxUnitLength() int {
return _MaxUnitLength
}
// CLIMinUnitCount returns the value of the min-unit-count option: the
// minimum number of repeated units.
func CLIMinUnitCount() int {
return _MinUnitCount
}
// CLIMicroSatRegex returns the source of the regular expression describing a
// microsatellite according to the current options: a unit of
// _MinUnitLength to _MaxUnitLength nucleotides (lowercase a, c, g, t)
// followed by at least _MinUnitCount-1 further copies of itself.
//
// The repeat quantifier is open ended ("{n,}") so that the expression agrees
// with the one compiled by MakeMicrosatWorker and treats the unit count as a
// minimum rather than an exact number.
func CLIMicroSatRegex() string {
	return fmt.Sprintf("([acgt]{%d,%d})\\1{%d,}", _MinUnitLength, _MaxUnitLength, _MinUnitCount-1)
}

View File

@@ -132,7 +132,7 @@ func FindClosests(sequence *obiseq.BioSequence,
lcs, alilength := -1, -1
switch maxe {
case 0:
if obiutils.UnsafeStringFreomBytes(sequence.Sequence()) == obiutils.UnsafeStringFreomBytes(references[order].Sequence()) {
if obiutils.UnsafeStringFromBytes(sequence.Sequence()) == obiutils.UnsafeStringFromBytes(references[order].Sequence()) {
score = 0
alilength = sequence.Len()
lcs = alilength
@@ -279,7 +279,7 @@ func Identify(sequence *obiseq.BioSequence,
var bestmatch string
var taxon *obitax.TaxNode
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFreomBytes(sequence.Sequence())]
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFromBytes(sequence.Sequence())]
if ok {
taxon = exacttaxon.Taxon
bestmatch = exacttaxon.Id
@@ -399,7 +399,7 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
ft[len(ft)] = taxa[i]
seqstr := obiutils.UnsafeStringFreomBytes(seq.Sequence())
seqstr := obiutils.UnsafeStringFromBytes(seq.Sequence())
em, ok := exactmatch[seqstr]
if !ok {