From bdb96dda9417cde95dd23ff5824edba46e25ab99 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Mon, 5 Aug 2024 15:31:20 +0200 Subject: [PATCH] Adds the obimicrosat command --- cmd/obitools/obimicrosat/main.go | 49 ++++++++ go.mod | 1 + go.sum | 2 + pkg/obikmer/debruijn.go | 19 +++ pkg/obikmer/kmermap.go | 176 +++++++++++++++++++++++++++ pkg/obioptions/version.go | 2 +- pkg/obiseq/biosequence.go | 8 ++ pkg/obitools/obimicrosat/microsat.go | 104 ++++++++++++++++ pkg/obitools/obimicrosat/options.go | 55 +++++++++ pkg/obitools/obitag2/obitag.go | 6 +- pkg/obiutils/strings.go | 2 +- 11 files changed, 419 insertions(+), 5 deletions(-) create mode 100644 cmd/obitools/obimicrosat/main.go create mode 100644 pkg/obikmer/kmermap.go create mode 100644 pkg/obitools/obimicrosat/microsat.go create mode 100644 pkg/obitools/obimicrosat/options.go diff --git a/cmd/obitools/obimicrosat/main.go b/cmd/obitools/obimicrosat/main.go new file mode 100644 index 0000000..5035ff0 --- /dev/null +++ b/cmd/obitools/obimicrosat/main.go @@ -0,0 +1,49 @@ +package main + +import ( + "os" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obimicrosat" +) + +func main() { + + defer obiseq.LogBioSeqStatus() + + // go tool pprof -http=":8000" ./obipairing ./cpu.pprof + // f, err := os.Create("cpu.pprof") + // if err != nil { + // log.Fatal(err) + // } + // pprof.StartCPUProfile(f) + // defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() + + optionParser := obioptions.GenerateOptionParser(obimicrosat.OptionSet) + + _, args := optionParser(os.Args) + + sequences, err := obiconvert.CLIReadBioSequences(args...) + + if err != nil { + log.Errorf("Cannot open file (%v)", err) + os.Exit(1) + } + selected := obimicrosat.CLIAnnotateMicrosat(sequences) + obiconvert.CLIWriteBioSequences(selected, true) + obiiter.WaitForLastPipe() + +} diff --git a/go.mod b/go.mod index 81aed9d..88c315d 100644 --- a/go.mod +++ b/go.mod @@ -28,6 +28,7 @@ require ( github.com/cloudwego/base64x v0.1.4 // indirect github.com/cloudwego/iasm v0.2.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect + github.com/dlclark/regexp2 v1.11.4 // indirect github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect github.com/klauspost/cpuid/v2 v2.0.9 // indirect github.com/kr/pretty v0.3.0 // indirect diff --git a/go.sum b/go.sum index 6ae044b..fa803db 100644 --- a/go.sum +++ b/go.sum @@ -20,6 +20,8 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo= +github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q= github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo= github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY= diff --git a/pkg/obikmer/debruijn.go b/pkg/obikmer/debruijn.go index 133304c..8ed0fc6 100644 --- a/pkg/obikmer/debruijn.go +++ b/pkg/obikmer/debruijn.go @@ -39,6 +39,25 @@ var iupac = map[byte][]uint64{ 'n': {0, 1, 2, 3}, } +var revcompnuc = map[byte]byte{ + 'a': 't', + 'c': 'g', + 'g': 'c', + 't': 'a', + 'u': 'a', + 'r': 'y', + 'y': 'r', + 's': 's', + 'w': 'w', + 'k': 'm', + 'm': 'k', + 'b': 'v', + 'd': 'h', + 'h': 'd', + 'v': 'b', + 'n': 'n', +} + var decode = map[uint64]byte{ 0: 'a', 1: 'c', diff --git a/pkg/obikmer/kmermap.go b/pkg/obikmer/kmermap.go new file mode 100644 index 0000000..1511214 --- /dev/null +++ b/pkg/obikmer/kmermap.go @@ -0,0 +1,176 @@ +package obikmer + +import ( + "os" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "github.com/schollz/progressbar/v3" +) + +type KmerMap struct { + index map[KmerIdx64][]*obiseq.BioSequence + kmersize int + kmermask KmerIdx64 +} + +type KmerMatch map[*obiseq.BioSequence]int + +func (k *KmerMap) KmerSize() int { + return k.kmersize +} + +func (k *KmerMap) Len() int { + return len(k.index) +} + +func (k *KmerMap) Push(sequence *obiseq.BioSequence) { + current := KmerIdx64(0) + ccurrent := KmerIdx64(0) + lshift := uint(2 * (k.kmersize - 1)) + + nuc := sequence.Sequence() + size := 0 + for i := 0; i < len(nuc)-k.kmersize+1; i++ { + current <<= 2 + ccurrent >>= 2 + code := iupac[nuc[i]] + ccode := iupac[revcompnuc[nuc[i]]] + + if len(code) != 1 { + current = KmerIdx64(0) + ccurrent = KmerIdx64(0) + size = 0 + continue + } + + current |= KmerIdx64(code[0]) + ccurrent |= KmerIdx64(ccode[0]) << lshift + size++ + + if size == k.kmersize { + + kmer := min(k.kmermask¤t, k.kmermask&ccurrent) + k.index[kmer] = append(k.index[kmer], sequence) + size-- + } + } +} + +func (k *KmerMap) Query(sequence *obiseq.BioSequence) KmerMatch { + current := KmerIdx64(0) + ccurrent := KmerIdx64(0) + + rep := make(KmerMatch) + + nuc := sequence.Sequence() + size := 0 + for i := 0; i < len(nuc)-k.kmersize+1; i++ { + current <<= 2 + ccurrent >>= 2 + + code := iupac[nuc[i]] + ccode := iupac[revcompnuc[nuc[i]]] + + if len(code) != 1 { + current = KmerIdx64(0) + ccurrent = KmerIdx64(0) + size = 0 + continue + } + + current |= KmerIdx64(code[0]) + ccurrent |= KmerIdx64(ccode[0]) << uint(2*(k.kmersize-1)) + size++ + + if size == k.kmersize { + kmer := min(k.kmermask¤t, k.kmermask&ccurrent) + if _, ok := k.index[kmer]; ok { + for _, seq := range k.index[kmer] { + if seq != sequence { + if _, ok := rep[seq]; !ok { + rep[seq] = 0 + } + rep[seq]++ + } + } + } + size-- + } + } + + return rep +} + +func (k *KmerMatch) FilterMinCount(mincount int) { + for seq, count := range *k { + if count < mincount { + delete(*k, seq) + } + } +} + +func (k *KmerMatch) Len() int { + return len(*k) +} + +func (k *KmerMatch) Sequences() obiseq.BioSequenceSlice { + ks := make([]*obiseq.BioSequence, 0, len(*k)) + + for seq := range *k { + ks = append(ks, seq) + } + + return ks +} + +func (k *KmerMatch) Max() *obiseq.BioSequence { + max := 0 + var maxseq *obiseq.BioSequence + for seq, n := range *k { + if max < n { + max = n + maxseq = seq + } + } + return maxseq +} + +func NewKmerMap(sequences obiseq.BioSequenceSlice, kmersize int) *KmerMap { + idx := make(map[KmerIdx64][]*obiseq.BioSequence) + + kmermask := KmerIdx64(^(^uint64(0) << (uint64(kmersize) * 2))) + + kmap := &KmerMap{kmersize: kmersize, kmermask: kmermask, index: idx} + + n := len(sequences) + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowCount(), + progressbar.OptionShowIts(), + progressbar.OptionSetDescription("Indexing kmers"), + ) + + bar := progressbar.NewOptions(n, pbopt...) + + for i, sequence := range sequences { + kmap.Push(sequence) + if i%100 == 0 { + bar.Add(100) + } + } + + return kmap +} + +func (k *KmerMap) MakeCountMatchWorker(minKmerCount int) obiseq.SeqWorker { + return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + matches := k.Query(sequence) + matches.FilterMinCount(minKmerCount) + n := matches.Len() + + sequence.SetAttribute("obikmer_match_count", n) + return obiseq.BioSequenceSlice{sequence}, nil + } +} diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index a9b1642..8053eaa 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "886b5d9" +var _Commit = "3f57935" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index ee17212..00d2bd9 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -525,3 +525,11 @@ func (s *BioSequence) Grow(length int) { s.qualities = slices.Grow(s.qualities, length) } } + +// SameAs checks if the sequence of the current BioSequence is the same as the sequence of the other BioSequence. +// +// other: a pointer to the other BioSequence. +// Returns a boolean indicating whether the sequences are the same. +func (s *BioSequence) SameAs(other *BioSequence) bool { + return obiutils.UnsafeStringFromBytes(s.sequence) == obiutils.UnsafeStringFromBytes(other.sequence) +} diff --git a/pkg/obitools/obimicrosat/microsat.go b/pkg/obitools/obimicrosat/microsat.go new file mode 100644 index 0000000..c2ee65a --- /dev/null +++ b/pkg/obitools/obimicrosat/microsat.go @@ -0,0 +1,104 @@ +package obimicrosat + +import ( + "fmt" + "sort" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "github.com/dlclark/regexp2" +) + +func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker { + + min_unit := func(microsat string) int { + for i := 1; i < len(microsat); i++ { + s1 := microsat[0 : len(microsat)-i] + s2 := microsat[i:] + + if s1 == s2 { + return i + } + } + + return 0 + } + + normalizedUnit := func(unit string) string { + all := make([]string, 0, len(unit)*2) + + for i := 0; i < len(unit); i++ { + rotate := unit[i:] + unit[:i] + revcomp_rotate := obiseq.NewBioSequence("", []byte(rotate), "").ReverseComplement(true).String() + all = append(all, rotate, revcomp_rotate) + } + + sort.Slice(all, func(i, j int) bool { + return all[i] < all[j] + }) + + return all[0] + } + + build_regexp := func(minLength, maxLength, minUnits int) *regexp2.Regexp { + return regexp2.MustCompile( + fmt.Sprintf("([acgt]{%d,%d})\\1{%d,}", + minLength, + maxLength, + minUnits-1, + ), + regexp2.RE2) + } + + regexp := build_regexp(minLength, maxLength, minUnits) + + w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + + match, _ := regexp.FindStringMatch(sequence.String()) + + if match == nil { + return obiseq.BioSequenceSlice{}, nil + } + + unit_length := min_unit(match.String()) + + if unit_length < minLength { + return obiseq.BioSequenceSlice{}, nil + } + + pattern := build_regexp(unit_length, unit_length, minUnits) + + match, _ = pattern.FindStringMatch(sequence.String()) + + unit := match.String()[0:unit_length] + + sequence.SetAttribute("microsat_unit_length", unit_length) + sequence.SetAttribute("microsat_unit_count", match.Length/unit_length) + sequence.SetAttribute("seq_length", sequence.Len()) + sequence.SetAttribute("microsat", match.String()) + sequence.SetAttribute("microsat_from", match.Index) + sequence.SetAttribute("microsat_to", match.Index+match.Length-1) + + sequence.SetAttribute("microsat_unit", unit) + sequence.SetAttribute("microsat_unit_normalized", normalizedUnit(unit)) + + sequence.SetAttribute("microsat_left", sequence.String()[0:match.Index]) + sequence.SetAttribute("microsat_right", sequence.String()[match.Index+match.Length:]) + + return obiseq.BioSequenceSlice{sequence}, nil + } + + return obiseq.SeqWorker(w) +} + +func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence { + var newIter obiiter.IBioSequence + + worker := MakeMicrosatWorker(CLIMinUnitLength(), CLIMaxUnitLength(), CLIMinUnitCount()) + + newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers()) + + return newIter.FilterEmpty() + +} diff --git a/pkg/obitools/obimicrosat/options.go b/pkg/obitools/obimicrosat/options.go new file mode 100644 index 0000000..86a19cc --- /dev/null +++ b/pkg/obitools/obimicrosat/options.go @@ -0,0 +1,55 @@ +package obimicrosat + +import ( + "fmt" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +var _MinUnitLength = 1 +var _MaxUnitLength = 6 +var _MinUnitCount = 5 + +// PCROptionSet defines every options related to a simulated PCR. +// +// The function adds to a CLI every options proposed to the user +// to tune the parametters of the PCR simulation algorithm. +// +// # Parameters +// +// - option : is a pointer to a getoptions.GetOpt instance normaly +// produced by the +func MicroSatelliteOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_MinUnitLength, "min-unit-length", _MinUnitLength, + options.Alias("m"), + options.Description("Minimum length of a microsatellite unit.")) + + options.IntVar(&_MaxUnitLength, "max-unit-length", _MaxUnitLength, + options.Alias("M"), + options.Description("Maximum length of a microsatellite unit.")) + + options.IntVar(&_MinUnitCount, "min-unit-count", _MinUnitCount, + options.Description("Minumum number of repeated units.")) +} + +func OptionSet(options *getoptions.GetOpt) { + obiconvert.OptionSet(options) + MicroSatelliteOptionSet(options) +} + +func CLIMinUnitLength() int { + return _MinUnitLength +} + +func CLIMaxUnitLength() int { + return _MaxUnitLength +} + +func CLIMinUnitCount() int { + return _MinUnitCount +} + +func CLIMicroSatRegex() string { + return fmt.Sprintf("([acgt]{%d,%d})\\1{%d}", _MinUnitLength, _MaxUnitLength, _MinUnitCount-1) +} diff --git a/pkg/obitools/obitag2/obitag.go b/pkg/obitools/obitag2/obitag.go index 00af2e3..5dc8438 100644 --- a/pkg/obitools/obitag2/obitag.go +++ b/pkg/obitools/obitag2/obitag.go @@ -132,7 +132,7 @@ func FindClosests(sequence *obiseq.BioSequence, lcs, alilength := -1, -1 switch maxe { case 0: - if obiutils.UnsafeStringFreomBytes(sequence.Sequence()) == obiutils.UnsafeStringFreomBytes(references[order].Sequence()) { + if obiutils.UnsafeStringFromBytes(sequence.Sequence()) == obiutils.UnsafeStringFromBytes(references[order].Sequence()) { score = 0 alilength = sequence.Len() lcs = alilength @@ -279,7 +279,7 @@ func Identify(sequence *obiseq.BioSequence, var bestmatch string var taxon *obitax.TaxNode - exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFreomBytes(sequence.Sequence())] + exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFromBytes(sequence.Sequence())] if ok { taxon = exacttaxon.Taxon bestmatch = exacttaxon.Id @@ -399,7 +399,7 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence, ft[len(ft)] = taxa[i] - seqstr := obiutils.UnsafeStringFreomBytes(seq.Sequence()) + seqstr := obiutils.UnsafeStringFromBytes(seq.Sequence()) em, ok := exactmatch[seqstr] if !ok { diff --git a/pkg/obiutils/strings.go b/pkg/obiutils/strings.go index fc3d576..3182031 100644 --- a/pkg/obiutils/strings.go +++ b/pkg/obiutils/strings.go @@ -2,7 +2,7 @@ package obiutils import "unsafe" -func UnsafeStringFreomBytes(data []byte) string { +func UnsafeStringFromBytes(data []byte) string { if len(data) > 0 { s := unsafe.String(unsafe.SliceData(data), len(data)) return s