From cd330db67223a2ced4a011e8ca3f1c36e1c83890 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Fri, 30 Aug 2024 11:16:43 +0200 Subject: [PATCH] Add option to obimicrosat to control microsat length and orientation --- pkg/obitools/obimicrosat/microsat.go | 101 +++++++++++++++++++++++---- pkg/obitools/obimicrosat/options.go | 27 +++++++ 2 files changed, 113 insertions(+), 15 deletions(-) diff --git a/pkg/obitools/obimicrosat/microsat.go b/pkg/obitools/obimicrosat/microsat.go index c2ee65a..f9fd95d 100644 --- a/pkg/obitools/obimicrosat/microsat.go +++ b/pkg/obitools/obimicrosat/microsat.go @@ -10,7 +10,33 @@ import ( "github.com/dlclark/regexp2" ) -func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker { +// MakeMicrosatWorker creates a SeqWorker that finds microsatellite regions in a BioSequence. +// +// The function takes three integer parameters: minLength, maxLength, and minUnits. minLength specifies the +// minimum length of the microsatellite region, maxLength specifies the maximum length, and minUnits specifies +// the minimum number of repeating units. The function returns an obiseq.SeqWorker, which is a Go function that +// takes a BioSequence as input and returns a BioSequenceSlice and an error. The SeqWorker performs the following +// steps: +// 1. It defines two helper functions: min_unit and normalizedUnit. +// 2. It defines a regular expression pattern based on the input parameters. +// 3. It defines the SeqWorker function w. +// 4. The w function searches for a match of the regular expression pattern in the sequence string. +// 5. If no match is found, it returns an empty BioSequenceSlice and nil error. +// 6. It calculates the length of the matching unit. +// 7. It checks if the unit length is less than minLength. +// 8. It creates a new regular expression pattern based on the unit length. +// 9. It extracts the matching unit from the sequence string. +// 10. It sets various attributes on the sequence. +// 11. It returns a BioSequenceSlice containing the original sequence and nil error. +// +// Parameters: +// - minLength: the minimum length of the microsatellite region. +// - maxLength: the maximum length of the microsatellite region. +// - minUnits: the minimum number of repeating units. +// +// Return type: +// - obiseq.SeqWorker: a Go function that takes a BioSequence as input and returns a BioSequenceSlice and an error. +func MakeMicrosatWorker(minUnitLength, maxUnitLength, minUnits, minLength, minflankLength int, reoriented bool) obiseq.SeqWorker { min_unit := func(microsat string) int { for i := 1; i < len(microsat); i++ { @@ -25,20 +51,30 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker { return 0 } - normalizedUnit := func(unit string) string { - all := make([]string, 0, len(unit)*2) + normalizedUnit := func(unit string) (string, bool) { + all := make([]struct { + unit string + direct bool + }, 0, len(unit)*2) for i := 0; i < len(unit); i++ { rotate := unit[i:] + unit[:i] revcomp_rotate := obiseq.NewBioSequence("", []byte(rotate), "").ReverseComplement(true).String() - all = append(all, rotate, revcomp_rotate) + all = append(all, struct { + unit string + direct bool + }{unit: rotate, direct: true}, + struct { + unit string + direct bool + }{unit: revcomp_rotate, direct: false}) } sort.Slice(all, func(i, j int) bool { - return all[i] < all[j] + return all[i].unit < all[j].unit }) - return all[0] + return all[0].unit, all[0].direct } build_regexp := func(minLength, maxLength, minUnits int) *regexp2.Regexp { @@ -51,7 +87,7 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker { regexp2.RE2) } - regexp := build_regexp(minLength, maxLength, minUnits) + regexp := build_regexp(minUnitLength, maxUnitLength, minUnits) w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { @@ -63,7 +99,7 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker { unit_length := min_unit(match.String()) - if unit_length < minLength { + if unit_length < minUnitLength { return obiseq.BioSequenceSlice{}, nil } @@ -71,20 +107,46 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker { match, _ = pattern.FindStringMatch(sequence.String()) + if match.Length < minLength { + return obiseq.BioSequenceSlice{}, nil + } + unit := match.String()[0:unit_length] + normalized, direct := normalizedUnit(unit) + matchFrom := match.Index + + if !direct && reoriented { + sequence = sequence.ReverseComplement(true) + sequence.SetId(sequence.Id() + "_cmp") + matchFrom = sequence.Len() - match.Index - match.Length + } + + matchTo := matchFrom + match.Length + + microsat := sequence.String()[matchFrom:matchTo] + unit = microsat[0:unit_length] + + left := sequence.String()[0:matchFrom] + right := sequence.String()[matchTo:] + + if len(left) < minflankLength || len(right) < minflankLength { + return obiseq.BioSequenceSlice{}, nil + } sequence.SetAttribute("microsat_unit_length", unit_length) sequence.SetAttribute("microsat_unit_count", match.Length/unit_length) sequence.SetAttribute("seq_length", sequence.Len()) - sequence.SetAttribute("microsat", match.String()) - sequence.SetAttribute("microsat_from", match.Index) - sequence.SetAttribute("microsat_to", match.Index+match.Length-1) + sequence.SetAttribute("microsat", microsat) + sequence.SetAttribute("microsat_from", matchFrom+1) + sequence.SetAttribute("microsat_to", matchTo) sequence.SetAttribute("microsat_unit", unit) - sequence.SetAttribute("microsat_unit_normalized", normalizedUnit(unit)) + sequence.SetAttribute("microsat_unit_normalized", normalized) - sequence.SetAttribute("microsat_left", sequence.String()[0:match.Index]) - sequence.SetAttribute("microsat_right", sequence.String()[match.Index+match.Length:]) + sequence.SetAttribute("microsat_unit_orientation", map[bool]string{true: "direct", false: "reverse"}[direct]) + + sequence.SetAttribute("microsat_left", left) + sequence.SetAttribute("microsat_right", right) return obiseq.BioSequenceSlice{sequence}, nil } @@ -92,10 +154,19 @@ func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker { return obiseq.SeqWorker(w) } +// CLIAnnotateMicrosat is a function that annotates microsatellites in a given sequence. +// +// It takes an iterator of type `obiiter.IBioSequence` as a parameter. +// The function returns an iterator of type `obiiter.IBioSequence`. func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence { var newIter obiiter.IBioSequence - worker := MakeMicrosatWorker(CLIMinUnitLength(), CLIMaxUnitLength(), CLIMinUnitCount()) + worker := MakeMicrosatWorker(CLIMinUnitLength(), + CLIMaxUnitLength(), + CLIMinUnitCount(), + CLIMinLength(), + CLIMinFlankLength(), + CLIReoriented()) newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers()) diff --git a/pkg/obitools/obimicrosat/options.go b/pkg/obitools/obimicrosat/options.go index 86a19cc..5d22604 100644 --- a/pkg/obitools/obimicrosat/options.go +++ b/pkg/obitools/obimicrosat/options.go @@ -10,6 +10,9 @@ import ( var _MinUnitLength = 1 var _MaxUnitLength = 6 var _MinUnitCount = 5 +var _MinLength = 20 +var _MinFlankLength = 0 +var _NotReoriented = false // PCROptionSet defines every options related to a simulated PCR. // @@ -31,6 +34,18 @@ func MicroSatelliteOptionSet(options *getoptions.GetOpt) { options.IntVar(&_MinUnitCount, "min-unit-count", _MinUnitCount, options.Description("Minumum number of repeated units.")) + + options.IntVar(&_MinLength, "min-length", _MinLength, + options.Alias("l"), + options.Description("Minimum length of a microsatellite.")) + + options.IntVar(&_MinFlankLength, "min-flank-length", _MinFlankLength, + options.Alias("f"), + options.Description("Minimum length of the flanking sequences.")) + + options.BoolVar(&_NotReoriented, "not-reoriented", _NotReoriented, + options.Alias("n"), + options.Description("Do not reorient the microsatellites.")) } func OptionSet(options *getoptions.GetOpt) { @@ -53,3 +68,15 @@ func CLIMinUnitCount() int { func CLIMicroSatRegex() string { return fmt.Sprintf("([acgt]{%d,%d})\\1{%d}", _MinUnitLength, _MaxUnitLength, _MinUnitCount-1) } + +func CLIMinLength() int { + return _MinLength +} + +func CLIMinFlankLength() int { + return _MinFlankLength +} + +func CLIReoriented() bool { + return !_NotReoriented +}