Adds the obimicrosat command

This commit is contained in:
Eric Coissac
2024-08-05 15:31:20 +02:00
parent 3f57935328
commit bdb96dda94
11 changed files with 419 additions and 5 deletions

View File

@ -0,0 +1,49 @@
package main
import (
"os"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obimicrosat"
)
// main is the entry point of the obimicrosat command: it parses the command
// line, reads the input sequences, keeps and annotates those in which a
// microsatellite is found, and writes them out.
func main() {
	defer obiseq.LogBioSeqStatus()

	// CPU profiling, kept for debugging sessions:
	//
	// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
	// f, err := os.Create("cpu.pprof")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// pprof.StartCPUProfile(f)
	// defer pprof.StopCPUProfile()

	// Execution tracing, kept for debugging sessions:
	//
	// go tool trace cpu.trace
	// ftrace, err := os.Create("cpu.trace")
	// if err != nil {
	// 	log.Fatal(err)
	// }
	// trace.Start(ftrace)
	// defer trace.Stop()

	parseOptions := obioptions.GenerateOptionParser(obimicrosat.OptionSet)
	_, inputs := parseOptions(os.Args)

	input, err := obiconvert.CLIReadBioSequences(inputs...)
	if err != nil {
		log.Errorf("Cannot open file (%v)", err)
		// os.Exit does not run deferred calls, so LogBioSeqStatus is
		// skipped on this path.
		os.Exit(1)
	}

	annotated := obimicrosat.CLIAnnotateMicrosat(input)
	obiconvert.CLIWriteBioSequences(annotated, true)

	// Block until the processing pipeline has finished.
	obiiter.WaitForLastPipe()
}

1
go.mod
View File

@ -28,6 +28,7 @@ require (
github.com/cloudwego/base64x v0.1.4 // indirect
github.com/cloudwego/iasm v0.2.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dlclark/regexp2 v1.11.4 // indirect
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
github.com/klauspost/cpuid/v2 v2.0.9 // indirect
github.com/kr/pretty v0.3.0 // indirect

2
go.sum
View File

@ -20,6 +20,8 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=

View File

@ -39,6 +39,25 @@ var iupac = map[byte][]uint64{
'n': {0, 1, 2, 3},
}
// revcompnuc maps a lowercase IUPAC nucleotide code to the code of its
// complement ('u' is complemented to 'a'). Only lowercase keys are defined:
// looking up any other byte yields the zero byte.
var revcompnuc = map[byte]byte{
'a': 't',
'c': 'g',
'g': 'c',
't': 'a',
'u': 'a',
'r': 'y',
'y': 'r',
's': 's',
'w': 'w',
'k': 'm',
'm': 'k',
'b': 'v',
'd': 'h',
'h': 'd',
'v': 'b',
'n': 'n',
}
var decode = map[uint64]byte{
0: 'a',
1: 'c',

176
pkg/obikmer/kmermap.go Normal file
View File

@ -0,0 +1,176 @@
package obikmer
import (
"os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/schollz/progressbar/v3"
)
// KmerMap is an index from k-mer codes to the sequences containing them.
// K-mers are stored under a strand-independent code: the smaller of the
// forward code and the reverse-complement code (see Push).
type KmerMap struct {
// index lists, for each k-mer code, the sequences pushed under that code.
index map[KmerIdx64][]*obiseq.BioSequence
// kmersize is the length of the indexed k-mers, in nucleotides.
kmersize int
// kmermask keeps the low 2*kmersize bits of a rolling k-mer code.
kmermask KmerIdx64
}
// KmerMatch associates a sequence with the number of k-mer hits counted for
// it by KmerMap.Query.
type KmerMatch map[*obiseq.BioSequence]int
// KmerSize returns the length, in nucleotides, of the k-mers stored in the map.
func (k *KmerMap) KmerSize() int {
return k.kmersize
}
// Len returns the number of distinct k-mer codes present in the index.
func (k *KmerMap) Len() int {
return len(k.index)
}
// Push indexes every k-mer of sequence in the map.
//
// The sequence is scanned once with a rolling 2-bit encoding: current holds
// the forward k-mer and ccurrent its reverse complement. Each k-mer is stored
// under the smaller of these two codes, so that a word and its reverse
// complement share a single entry. A nucleotide that does not map to exactly
// one code in iupac (ambiguity code, unknown byte) restarts the window just
// after it, so no k-mer overlapping it is indexed.
//
// The sequence is appended once per k-mer occurrence; it can therefore appear
// several times under the same code.
func (k *KmerMap) Push(sequence *obiseq.BioSequence) {
	if k.kmersize <= 0 {
		return
	}

	current := KmerIdx64(0)
	ccurrent := KmerIdx64(0)
	// Bit position of the first nucleotide of the reverse-complement k-mer.
	lshift := uint(2 * (k.kmersize - 1))

	size := 0

	// Every nucleotide has to be shifted into the window: the k-mer ending
	// at position i is complete only once nuc[i] has been read. Stopping at
	// len(nuc)-kmersize would drop the last kmersize-1 k-mers.
	for _, nuc := range sequence.Sequence() {
		code := iupac[nuc]
		ccode := iupac[revcompnuc[nuc]]

		if len(code) != 1 || len(ccode) != 1 {
			current, ccurrent, size = 0, 0, 0
			continue
		}

		current = (current << 2) | KmerIdx64(code[0])
		ccurrent = (ccurrent >> 2) | (KmerIdx64(ccode[0]) << lshift)
		size++

		if size == k.kmersize {
			kmer := min(k.kmermask&current, k.kmermask&ccurrent)
			k.index[kmer] = append(k.index[kmer], sequence)
			// Keep kmersize-1 nucleotides in the window for the next k-mer.
			size--
		}
	}
}
// Query counts, for every indexed sequence, the k-mer hits it shares with
// sequence.
//
// The k-mers of sequence are enumerated exactly as in Push (rolling 2-bit
// code, smaller of the forward and reverse-complement codes, window restarted
// after any nucleotide that does not map to a single code). For each k-mer,
// every entry stored in the index under that code adds one to the count of
// the corresponding sequence. The queried sequence itself (same pointer) is
// never reported.
//
// The returned KmerMatch is empty, not nil, when nothing matches.
func (k *KmerMap) Query(sequence *obiseq.BioSequence) KmerMatch {
	rep := make(KmerMatch)
	if k.kmersize <= 0 {
		return rep
	}

	current := KmerIdx64(0)
	ccurrent := KmerIdx64(0)
	// Bit position of the first nucleotide of the reverse-complement k-mer.
	lshift := uint(2 * (k.kmersize - 1))

	size := 0

	// Every nucleotide has to be shifted into the window: the k-mer ending
	// at position i is complete only once nuc[i] has been read. Stopping at
	// len(nuc)-kmersize would ignore the last kmersize-1 k-mers.
	for _, nuc := range sequence.Sequence() {
		code := iupac[nuc]
		ccode := iupac[revcompnuc[nuc]]

		if len(code) != 1 || len(ccode) != 1 {
			current, ccurrent, size = 0, 0, 0
			continue
		}

		current = (current << 2) | KmerIdx64(code[0])
		ccurrent = (ccurrent >> 2) | (KmerIdx64(ccode[0]) << lshift)
		size++

		if size == k.kmersize {
			kmer := min(k.kmermask&current, k.kmermask&ccurrent)
			// Ranging over the (possibly nil) entry needs no presence test,
			// and incrementing a missing key starts from zero.
			for _, seq := range k.index[kmer] {
				if seq != sequence {
					rep[seq]++
				}
			}
			// Keep kmersize-1 nucleotides in the window for the next k-mer.
			size--
		}
	}

	return rep
}
// FilterMinCount removes, in place, every sequence whose hit count is lower
// than mincount.
func (k *KmerMatch) FilterMinCount(mincount int) {
	matches := *k
	for candidate, hits := range matches {
		if hits >= mincount {
			continue
		}
		// Deleting entries while ranging over a map is allowed in Go.
		delete(matches, candidate)
	}
}
// Len returns the number of sequences recorded in the match set.
func (k *KmerMatch) Len() int {
return len(*k)
}
// Sequences returns the matched sequences as a slice. The order follows the
// map iteration order and is therefore unspecified.
func (k *KmerMatch) Sequences() obiseq.BioSequenceSlice {
	matches := *k
	seqs := make([]*obiseq.BioSequence, 0, len(matches))
	for candidate := range matches {
		seqs = append(seqs, candidate)
	}
	return seqs
}
// Max returns the sequence with the highest hit count.
//
// It returns nil when the match set is empty or holds no strictly positive
// count. When several sequences share the highest count, the one returned
// depends on the map iteration order.
func (k *KmerMatch) Max() *obiseq.BioSequence {
	var (
		best     *obiseq.BioSequence
		bestHits int
	)

	for candidate, hits := range *k {
		if hits > bestHits {
			best, bestHits = candidate, hits
		}
	}

	return best
}
// NewKmerMap builds a KmerMap of k-mers of length kmersize from sequences.
//
// Every sequence is indexed with Push while a progress bar is drawn on
// standard error. The mask keeps the low 2*kmersize bits of a k-mer code.
//
// NOTE(review): k-mer codes use two bits per nucleotide, so kmersize is
// presumably limited to 32 if KmerIdx64 is a 64-bit integer — confirm against
// its declaration.
func NewKmerMap(sequences obiseq.BioSequenceSlice, kmersize int) *KmerMap {
	idx := make(map[KmerIdx64][]*obiseq.BioSequence)

	kmermask := KmerIdx64(^(^uint64(0) << (uint64(kmersize) * 2)))

	kmap := &KmerMap{kmersize: kmersize, kmermask: kmermask, index: idx}

	n := len(sequences)
	bar := progressbar.NewOptions(n,
		progressbar.OptionSetWriter(os.Stderr),
		progressbar.OptionSetWidth(15),
		progressbar.OptionShowCount(),
		progressbar.OptionShowIts(),
		progressbar.OptionSetDescription("Indexing kmers"),
	)

	// The bar is refreshed once per batch of sequences. It is advanced only
	// after a full batch has really been indexed, and the last, incomplete
	// batch is reported after the loop, so the bar never runs ahead of the
	// work nor beyond its maximum.
	const batch = 100
	for i, sequence := range sequences {
		kmap.Push(sequence)
		if (i+1)%batch == 0 {
			// A display failure must not interrupt the indexing.
			_ = bar.Add(batch)
		}
	}
	if rest := n % batch; rest > 0 {
		_ = bar.Add(rest)
	}

	return kmap
}
// MakeCountMatchWorker returns a SeqWorker that queries the map with each
// sequence it receives, discards the matches having fewer than minKmerCount
// hits, and stores the number of remaining matches in the
// "obikmer_match_count" attribute of the sequence. The sequence is always
// returned, alone in its slice, with a nil error.
func (k *KmerMap) MakeCountMatchWorker(minKmerCount int) obiseq.SeqWorker {
	worker := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		hits := k.Query(sequence)
		hits.FilterMinCount(minKmerCount)

		sequence.SetAttribute("obikmer_match_count", hits.Len())

		return obiseq.BioSequenceSlice{sequence}, nil
	}

	return obiseq.SeqWorker(worker)
}

View File

@ -7,7 +7,7 @@ import (
// TODO: The version number is extracted from git. As a consequence, the
// version corresponds to the last commit, not to the commit in which this
// file will be committed.
var _Commit = "886b5d9"
var _Commit = "3f57935"
var _Version = "Release 4.2.0"
// Version returns the version of the obitools package.

View File

@ -525,3 +525,11 @@ func (s *BioSequence) Grow(length int) {
s.qualities = slices.Grow(s.qualities, length)
}
}
// SameAs reports whether s and other carry exactly the same sequence.
//
// The two sequences are compared byte for byte, so the comparison is case
// sensitive. Only the sequence is considered, whatever the other properties
// of the two BioSequence values.
//
// other: a pointer to the BioSequence to compare with; it must not be nil.
func (s *BioSequence) SameAs(other *BioSequence) bool {
	mine := obiutils.UnsafeStringFromBytes(s.sequence)
	theirs := obiutils.UnsafeStringFromBytes(other.sequence)

	return mine == theirs
}

View File

@ -0,0 +1,104 @@
package obimicrosat
import (
"fmt"
"sort"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"github.com/dlclark/regexp2"
)
// MakeMicrosatWorker builds a SeqWorker that looks for a microsatellite in a
// sequence and annotates the sequence with its description.
//
// A microsatellite is a unit of minLength to maxLength nucleotides (lowercase
// a, c, g or t) repeated in tandem at least minUnits times. The first match
// found is reduced to its shortest repeated unit, then searched again with
// that exact unit length so that the reported microsatellite is made of
// minimal units.
//
// The worker returns an empty slice when no microsatellite is found or when
// the shortest unit is shorter than minLength. Otherwise it returns the
// sequence with the following attributes set:
//
//   - microsat_unit_length: length of the repeated unit
//   - microsat_unit_count: number of complete units in the microsatellite
//   - seq_length: length of the whole sequence
//   - microsat: the microsatellite itself
//   - microsat_from, microsat_to: zero-based offsets of its first and last
//     nucleotides
//   - microsat_unit: the unit as found at the start of the microsatellite
//   - microsat_unit_normalized: the smallest, in lexicographic order, of all
//     the rotations of the unit and of their reverse complements
//   - microsat_left, microsat_right: the flanking sequences
//
// An error reported by the regular expression engine is returned to the
// caller instead of being ignored.
//
// The function panics if the parameters do not produce a valid regular
// expression (regexp2.MustCompile).
func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {

	// minimalPeriod returns the smallest shift i > 0 such that microsat is
	// equal to itself shifted by i nucleotides, that is the length of its
	// shortest repeated unit. It returns 0 when there is no such shift.
	minimalPeriod := func(microsat string) int {
		for i := 1; i < len(microsat); i++ {
			if microsat[:len(microsat)-i] == microsat[i:] {
				return i
			}
		}

		return 0
	}

	// normalizedUnit returns a representative of the unit that does not
	// depend on the strand nor on the phase in which the unit was read.
	normalizedUnit := func(unit string) string {
		all := make([]string, 0, len(unit)*2)

		for i := 0; i < len(unit); i++ {
			rotate := unit[i:] + unit[:i]
			revcompRotate := obiseq.NewBioSequence("", []byte(rotate), "").ReverseComplement(true).String()
			all = append(all, rotate, revcompRotate)
		}

		sort.Strings(all)

		return all[0]
	}

	// buildRegexp compiles the pattern matching a unit of minLength to
	// maxLength nucleotides followed by at least minUnits-1 copies of itself.
	buildRegexp := func(minLength, maxLength, minUnits int) *regexp2.Regexp {
		return regexp2.MustCompile(
			fmt.Sprintf("([acgt]{%d,%d})\\1{%d,}",
				minLength,
				maxLength,
				minUnits-1,
			),
			regexp2.RE2)
	}

	finder := buildRegexp(minLength, maxLength, minUnits)

	// Patterns for one exact unit length are compiled once here rather than
	// for every annotated sequence. The map is only read afterwards, so it
	// can be shared by the workers running in parallel.
	exact := make(map[int]*regexp2.Regexp)
	firstLength := minLength
	if firstLength < 1 {
		firstLength = 1
	}
	for length := firstLength; length <= maxLength; length++ {
		exact[length] = buildRegexp(length, length, minUnits)
	}

	w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		seq := sequence.String()

		match, err := finder.FindStringMatch(seq)
		if err != nil {
			return obiseq.BioSequenceSlice{}, fmt.Errorf("searching for a microsatellite: %w", err)
		}
		if match == nil {
			return obiseq.BioSequenceSlice{}, nil
		}

		unitLength := minimalPeriod(match.String())

		// A null unit length cannot describe a microsatellite and would
		// lead to a division by zero below.
		if unitLength == 0 || unitLength < minLength {
			return obiseq.BioSequenceSlice{}, nil
		}

		pattern, ok := exact[unitLength]
		if !ok {
			pattern = buildRegexp(unitLength, unitLength, minUnits)
		}

		match, err = pattern.FindStringMatch(seq)
		if err != nil {
			return obiseq.BioSequenceSlice{}, fmt.Errorf("refining the microsatellite: %w", err)
		}
		if match == nil {
			return obiseq.BioSequenceSlice{}, nil
		}

		microsat := match.String()
		unit := microsat[:unitLength]

		sequence.SetAttribute("microsat_unit_length", unitLength)
		sequence.SetAttribute("microsat_unit_count", match.Length/unitLength)
		sequence.SetAttribute("seq_length", sequence.Len())
		sequence.SetAttribute("microsat", microsat)
		sequence.SetAttribute("microsat_from", match.Index)
		sequence.SetAttribute("microsat_to", match.Index+match.Length-1)

		sequence.SetAttribute("microsat_unit", unit)
		sequence.SetAttribute("microsat_unit_normalized", normalizedUnit(unit))

		sequence.SetAttribute("microsat_left", seq[:match.Index])
		sequence.SetAttribute("microsat_right", seq[match.Index+match.Length:])

		return obiseq.BioSequenceSlice{sequence}, nil
	}

	return obiseq.SeqWorker(w)
}
// CLIAnnotateMicrosat applies the microsatellite worker, configured from the
// command line options, to every sequence of iterator. The returned iterator
// has gone through FilterEmpty, the worker returning an empty slice for the
// sequences it does not retain.
func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence {
	annotate := MakeMicrosatWorker(
		CLIMinUnitLength(),
		CLIMaxUnitLength(),
		CLIMinUnitCount(),
	)

	return iterator.
		MakeIWorker(annotate, false, obioptions.CLIParallelWorkers()).
		FilterEmpty()
}

View File

@ -0,0 +1,55 @@
package obimicrosat
import (
"fmt"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Values of the microsatellite options. They are initialised with the
// defaults and overwritten by the command line parser.
var (
	// _MinUnitLength is the minimum length of a microsatellite unit.
	_MinUnitLength = 1
	// _MaxUnitLength is the maximum length of a microsatellite unit.
	_MaxUnitLength = 6
	// _MinUnitCount is the minimum number of repeated units.
	_MinUnitCount = 5
)
// MicroSatelliteOptionSet defines the options related to the detection of
// microsatellites.
//
// The function adds to a CLI the options proposed to the user to tune the
// microsatellite search:
//
//   - min-unit-length (alias m): minimum length of a microsatellite unit
//   - max-unit-length (alias M): maximum length of a microsatellite unit
//   - min-unit-count: minimum number of repeated units
//
// The current values of the corresponding package variables are used as
// defaults.
//
// # Parameters
//
//   - options: a pointer to the getoptions.GetOpt instance on which the
//     options are registered
func MicroSatelliteOptionSet(options *getoptions.GetOpt) {
	options.IntVar(&_MinUnitLength, "min-unit-length", _MinUnitLength,
		options.Alias("m"),
		options.Description("Minimum length of a microsatellite unit."))

	options.IntVar(&_MaxUnitLength, "max-unit-length", _MaxUnitLength,
		options.Alias("M"),
		options.Description("Maximum length of a microsatellite unit."))

	options.IntVar(&_MinUnitCount, "min-unit-count", _MinUnitCount,
		options.Description("Minimum number of repeated units."))
}
// OptionSet registers on options the options of obiconvert followed by the
// microsatellite specific options.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
MicroSatelliteOptionSet(options)
}
// CLIMinUnitLength returns the minimum length of a microsatellite unit
// (option min-unit-length).
func CLIMinUnitLength() int {
return _MinUnitLength
}
// CLIMaxUnitLength returns the maximum length of a microsatellite unit
// (option max-unit-length).
func CLIMaxUnitLength() int {
return _MaxUnitLength
}
// CLIMinUnitCount returns the minimum number of repeated units
// (option min-unit-count).
func CLIMinUnitCount() int {
return _MinUnitCount
}
// CLIMicroSatRegex returns the regular expression describing a microsatellite
// for the current option values: a unit of _MinUnitLength to _MaxUnitLength
// nucleotides (lowercase a, c, g or t) followed by at least _MinUnitCount-1
// copies of itself.
//
// The repetition is open-ended ("{n,}"), as _MinUnitCount is a minimum and as
// done by the pattern built in MakeMicrosatWorker; an exact "{n}" quantifier
// would truncate longer microsatellites.
func CLIMicroSatRegex() string {
	return fmt.Sprintf("([acgt]{%d,%d})\\1{%d,}", _MinUnitLength, _MaxUnitLength, _MinUnitCount-1)
}

View File

@ -132,7 +132,7 @@ func FindClosests(sequence *obiseq.BioSequence,
lcs, alilength := -1, -1
switch maxe {
case 0:
if obiutils.UnsafeStringFreomBytes(sequence.Sequence()) == obiutils.UnsafeStringFreomBytes(references[order].Sequence()) {
if obiutils.UnsafeStringFromBytes(sequence.Sequence()) == obiutils.UnsafeStringFromBytes(references[order].Sequence()) {
score = 0
alilength = sequence.Len()
lcs = alilength
@ -279,7 +279,7 @@ func Identify(sequence *obiseq.BioSequence,
var bestmatch string
var taxon *obitax.TaxNode
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFreomBytes(sequence.Sequence())]
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFromBytes(sequence.Sequence())]
if ok {
taxon = exacttaxon.Taxon
bestmatch = exacttaxon.Id
@ -399,7 +399,7 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
ft[len(ft)] = taxa[i]
seqstr := obiutils.UnsafeStringFreomBytes(seq.Sequence())
seqstr := obiutils.UnsafeStringFromBytes(seq.Sequence())
em, ok := exactmatch[seqstr]
if !ok {

View File

@ -2,7 +2,7 @@ package obiutils
import "unsafe"
func UnsafeStringFreomBytes(data []byte) string {
func UnsafeStringFromBytes(data []byte) string {
if len(data) > 0 {
s := unsafe.String(unsafe.SliceData(data), len(data))
return s