mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Adds the obimicrosat command
This commit is contained in:
49
cmd/obitools/obimicrosat/main.go
Normal file
49
cmd/obitools/obimicrosat/main.go
Normal file
@ -0,0 +1,49 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obimicrosat"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
defer obiseq.LogBioSeqStatus()
|
||||
|
||||
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
|
||||
// f, err := os.Create("cpu.pprof")
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
// pprof.StartCPUProfile(f)
|
||||
// defer pprof.StopCPUProfile()
|
||||
|
||||
// go tool trace cpu.trace
|
||||
// ftrace, err := os.Create("cpu.trace")
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
// trace.Start(ftrace)
|
||||
// defer trace.Stop()
|
||||
|
||||
optionParser := obioptions.GenerateOptionParser(obimicrosat.OptionSet)
|
||||
|
||||
_, args := optionParser(os.Args)
|
||||
|
||||
sequences, err := obiconvert.CLIReadBioSequences(args...)
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Cannot open file (%v)", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
selected := obimicrosat.CLIAnnotateMicrosat(sequences)
|
||||
obiconvert.CLIWriteBioSequences(selected, true)
|
||||
obiiter.WaitForLastPipe()
|
||||
|
||||
}
|
1
go.mod
1
go.mod
@ -28,6 +28,7 @@ require (
|
||||
github.com/cloudwego/base64x v0.1.4 // indirect
|
||||
github.com/cloudwego/iasm v0.2.0 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dlclark/regexp2 v1.11.4 // indirect
|
||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
|
||||
github.com/klauspost/cpuid/v2 v2.0.9 // indirect
|
||||
github.com/kr/pretty v0.3.0 // indirect
|
||||
|
2
go.sum
2
go.sum
@ -20,6 +20,8 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dlclark/regexp2 v1.11.4 h1:rPYF9/LECdNymJufQKmri9gV604RvvABwgOA8un7yAo=
|
||||
github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
|
||||
github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
|
||||
github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
|
||||
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
|
||||
|
@ -39,6 +39,25 @@ var iupac = map[byte][]uint64{
|
||||
'n': {0, 1, 2, 3},
|
||||
}
|
||||
|
||||
var revcompnuc = map[byte]byte{
|
||||
'a': 't',
|
||||
'c': 'g',
|
||||
'g': 'c',
|
||||
't': 'a',
|
||||
'u': 'a',
|
||||
'r': 'y',
|
||||
'y': 'r',
|
||||
's': 's',
|
||||
'w': 'w',
|
||||
'k': 'm',
|
||||
'm': 'k',
|
||||
'b': 'v',
|
||||
'd': 'h',
|
||||
'h': 'd',
|
||||
'v': 'b',
|
||||
'n': 'n',
|
||||
}
|
||||
|
||||
var decode = map[uint64]byte{
|
||||
0: 'a',
|
||||
1: 'c',
|
||||
|
176
pkg/obikmer/kmermap.go
Normal file
176
pkg/obikmer/kmermap.go
Normal file
@ -0,0 +1,176 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
|
||||
type KmerMap struct {
|
||||
index map[KmerIdx64][]*obiseq.BioSequence
|
||||
kmersize int
|
||||
kmermask KmerIdx64
|
||||
}
|
||||
|
||||
type KmerMatch map[*obiseq.BioSequence]int
|
||||
|
||||
func (k *KmerMap) KmerSize() int {
|
||||
return k.kmersize
|
||||
}
|
||||
|
||||
func (k *KmerMap) Len() int {
|
||||
return len(k.index)
|
||||
}
|
||||
|
||||
func (k *KmerMap) Push(sequence *obiseq.BioSequence) {
|
||||
current := KmerIdx64(0)
|
||||
ccurrent := KmerIdx64(0)
|
||||
lshift := uint(2 * (k.kmersize - 1))
|
||||
|
||||
nuc := sequence.Sequence()
|
||||
size := 0
|
||||
for i := 0; i < len(nuc)-k.kmersize+1; i++ {
|
||||
current <<= 2
|
||||
ccurrent >>= 2
|
||||
code := iupac[nuc[i]]
|
||||
ccode := iupac[revcompnuc[nuc[i]]]
|
||||
|
||||
if len(code) != 1 {
|
||||
current = KmerIdx64(0)
|
||||
ccurrent = KmerIdx64(0)
|
||||
size = 0
|
||||
continue
|
||||
}
|
||||
|
||||
current |= KmerIdx64(code[0])
|
||||
ccurrent |= KmerIdx64(ccode[0]) << lshift
|
||||
size++
|
||||
|
||||
if size == k.kmersize {
|
||||
|
||||
kmer := min(k.kmermask¤t, k.kmermask&ccurrent)
|
||||
k.index[kmer] = append(k.index[kmer], sequence)
|
||||
size--
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (k *KmerMap) Query(sequence *obiseq.BioSequence) KmerMatch {
|
||||
current := KmerIdx64(0)
|
||||
ccurrent := KmerIdx64(0)
|
||||
|
||||
rep := make(KmerMatch)
|
||||
|
||||
nuc := sequence.Sequence()
|
||||
size := 0
|
||||
for i := 0; i < len(nuc)-k.kmersize+1; i++ {
|
||||
current <<= 2
|
||||
ccurrent >>= 2
|
||||
|
||||
code := iupac[nuc[i]]
|
||||
ccode := iupac[revcompnuc[nuc[i]]]
|
||||
|
||||
if len(code) != 1 {
|
||||
current = KmerIdx64(0)
|
||||
ccurrent = KmerIdx64(0)
|
||||
size = 0
|
||||
continue
|
||||
}
|
||||
|
||||
current |= KmerIdx64(code[0])
|
||||
ccurrent |= KmerIdx64(ccode[0]) << uint(2*(k.kmersize-1))
|
||||
size++
|
||||
|
||||
if size == k.kmersize {
|
||||
kmer := min(k.kmermask¤t, k.kmermask&ccurrent)
|
||||
if _, ok := k.index[kmer]; ok {
|
||||
for _, seq := range k.index[kmer] {
|
||||
if seq != sequence {
|
||||
if _, ok := rep[seq]; !ok {
|
||||
rep[seq] = 0
|
||||
}
|
||||
rep[seq]++
|
||||
}
|
||||
}
|
||||
}
|
||||
size--
|
||||
}
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func (k *KmerMatch) FilterMinCount(mincount int) {
|
||||
for seq, count := range *k {
|
||||
if count < mincount {
|
||||
delete(*k, seq)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (k *KmerMatch) Len() int {
|
||||
return len(*k)
|
||||
}
|
||||
|
||||
func (k *KmerMatch) Sequences() obiseq.BioSequenceSlice {
|
||||
ks := make([]*obiseq.BioSequence, 0, len(*k))
|
||||
|
||||
for seq := range *k {
|
||||
ks = append(ks, seq)
|
||||
}
|
||||
|
||||
return ks
|
||||
}
|
||||
|
||||
func (k *KmerMatch) Max() *obiseq.BioSequence {
|
||||
max := 0
|
||||
var maxseq *obiseq.BioSequence
|
||||
for seq, n := range *k {
|
||||
if max < n {
|
||||
max = n
|
||||
maxseq = seq
|
||||
}
|
||||
}
|
||||
return maxseq
|
||||
}
|
||||
|
||||
func NewKmerMap(sequences obiseq.BioSequenceSlice, kmersize int) *KmerMap {
|
||||
idx := make(map[KmerIdx64][]*obiseq.BioSequence)
|
||||
|
||||
kmermask := KmerIdx64(^(^uint64(0) << (uint64(kmersize) * 2)))
|
||||
|
||||
kmap := &KmerMap{kmersize: kmersize, kmermask: kmermask, index: idx}
|
||||
|
||||
n := len(sequences)
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
progressbar.OptionSetWriter(os.Stderr),
|
||||
progressbar.OptionSetWidth(15),
|
||||
progressbar.OptionShowCount(),
|
||||
progressbar.OptionShowIts(),
|
||||
progressbar.OptionSetDescription("Indexing kmers"),
|
||||
)
|
||||
|
||||
bar := progressbar.NewOptions(n, pbopt...)
|
||||
|
||||
for i, sequence := range sequences {
|
||||
kmap.Push(sequence)
|
||||
if i%100 == 0 {
|
||||
bar.Add(100)
|
||||
}
|
||||
}
|
||||
|
||||
return kmap
|
||||
}
|
||||
|
||||
func (k *KmerMap) MakeCountMatchWorker(minKmerCount int) obiseq.SeqWorker {
|
||||
return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
matches := k.Query(sequence)
|
||||
matches.FilterMinCount(minKmerCount)
|
||||
n := matches.Len()
|
||||
|
||||
sequence.SetAttribute("obikmer_match_count", n)
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
}
|
@ -7,7 +7,7 @@ import (
|
||||
// TODO: The version number is extracted from git. This induces that the version
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// commited
|
||||
var _Commit = "886b5d9"
|
||||
var _Commit = "3f57935"
|
||||
var _Version = "Release 4.2.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
@ -525,3 +525,11 @@ func (s *BioSequence) Grow(length int) {
|
||||
s.qualities = slices.Grow(s.qualities, length)
|
||||
}
|
||||
}
|
||||
|
||||
// SameAs checks if the sequence of the current BioSequence is the same as the sequence of the other BioSequence.
|
||||
//
|
||||
// other: a pointer to the other BioSequence.
|
||||
// Returns a boolean indicating whether the sequences are the same.
|
||||
func (s *BioSequence) SameAs(other *BioSequence) bool {
|
||||
return obiutils.UnsafeStringFromBytes(s.sequence) == obiutils.UnsafeStringFromBytes(other.sequence)
|
||||
}
|
||||
|
104
pkg/obitools/obimicrosat/microsat.go
Normal file
104
pkg/obitools/obimicrosat/microsat.go
Normal file
@ -0,0 +1,104 @@
|
||||
package obimicrosat
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"github.com/dlclark/regexp2"
|
||||
)
|
||||
|
||||
func MakeMicrosatWorker(minLength, maxLength, minUnits int) obiseq.SeqWorker {
|
||||
|
||||
min_unit := func(microsat string) int {
|
||||
for i := 1; i < len(microsat); i++ {
|
||||
s1 := microsat[0 : len(microsat)-i]
|
||||
s2 := microsat[i:]
|
||||
|
||||
if s1 == s2 {
|
||||
return i
|
||||
}
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
normalizedUnit := func(unit string) string {
|
||||
all := make([]string, 0, len(unit)*2)
|
||||
|
||||
for i := 0; i < len(unit); i++ {
|
||||
rotate := unit[i:] + unit[:i]
|
||||
revcomp_rotate := obiseq.NewBioSequence("", []byte(rotate), "").ReverseComplement(true).String()
|
||||
all = append(all, rotate, revcomp_rotate)
|
||||
}
|
||||
|
||||
sort.Slice(all, func(i, j int) bool {
|
||||
return all[i] < all[j]
|
||||
})
|
||||
|
||||
return all[0]
|
||||
}
|
||||
|
||||
build_regexp := func(minLength, maxLength, minUnits int) *regexp2.Regexp {
|
||||
return regexp2.MustCompile(
|
||||
fmt.Sprintf("([acgt]{%d,%d})\\1{%d,}",
|
||||
minLength,
|
||||
maxLength,
|
||||
minUnits-1,
|
||||
),
|
||||
regexp2.RE2)
|
||||
}
|
||||
|
||||
regexp := build_regexp(minLength, maxLength, minUnits)
|
||||
|
||||
w := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
|
||||
|
||||
match, _ := regexp.FindStringMatch(sequence.String())
|
||||
|
||||
if match == nil {
|
||||
return obiseq.BioSequenceSlice{}, nil
|
||||
}
|
||||
|
||||
unit_length := min_unit(match.String())
|
||||
|
||||
if unit_length < minLength {
|
||||
return obiseq.BioSequenceSlice{}, nil
|
||||
}
|
||||
|
||||
pattern := build_regexp(unit_length, unit_length, minUnits)
|
||||
|
||||
match, _ = pattern.FindStringMatch(sequence.String())
|
||||
|
||||
unit := match.String()[0:unit_length]
|
||||
|
||||
sequence.SetAttribute("microsat_unit_length", unit_length)
|
||||
sequence.SetAttribute("microsat_unit_count", match.Length/unit_length)
|
||||
sequence.SetAttribute("seq_length", sequence.Len())
|
||||
sequence.SetAttribute("microsat", match.String())
|
||||
sequence.SetAttribute("microsat_from", match.Index)
|
||||
sequence.SetAttribute("microsat_to", match.Index+match.Length-1)
|
||||
|
||||
sequence.SetAttribute("microsat_unit", unit)
|
||||
sequence.SetAttribute("microsat_unit_normalized", normalizedUnit(unit))
|
||||
|
||||
sequence.SetAttribute("microsat_left", sequence.String()[0:match.Index])
|
||||
sequence.SetAttribute("microsat_right", sequence.String()[match.Index+match.Length:])
|
||||
|
||||
return obiseq.BioSequenceSlice{sequence}, nil
|
||||
}
|
||||
|
||||
return obiseq.SeqWorker(w)
|
||||
}
|
||||
|
||||
func CLIAnnotateMicrosat(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
var newIter obiiter.IBioSequence
|
||||
|
||||
worker := MakeMicrosatWorker(CLIMinUnitLength(), CLIMaxUnitLength(), CLIMinUnitCount())
|
||||
|
||||
newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers())
|
||||
|
||||
return newIter.FilterEmpty()
|
||||
|
||||
}
|
55
pkg/obitools/obimicrosat/options.go
Normal file
55
pkg/obitools/obimicrosat/options.go
Normal file
@ -0,0 +1,55 @@
|
||||
package obimicrosat
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
var _MinUnitLength = 1
|
||||
var _MaxUnitLength = 6
|
||||
var _MinUnitCount = 5
|
||||
|
||||
// PCROptionSet defines every options related to a simulated PCR.
|
||||
//
|
||||
// The function adds to a CLI every options proposed to the user
|
||||
// to tune the parametters of the PCR simulation algorithm.
|
||||
//
|
||||
// # Parameters
|
||||
//
|
||||
// - option : is a pointer to a getoptions.GetOpt instance normaly
|
||||
// produced by the
|
||||
func MicroSatelliteOptionSet(options *getoptions.GetOpt) {
|
||||
options.IntVar(&_MinUnitLength, "min-unit-length", _MinUnitLength,
|
||||
options.Alias("m"),
|
||||
options.Description("Minimum length of a microsatellite unit."))
|
||||
|
||||
options.IntVar(&_MaxUnitLength, "max-unit-length", _MaxUnitLength,
|
||||
options.Alias("M"),
|
||||
options.Description("Maximum length of a microsatellite unit."))
|
||||
|
||||
options.IntVar(&_MinUnitCount, "min-unit-count", _MinUnitCount,
|
||||
options.Description("Minumum number of repeated units."))
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
MicroSatelliteOptionSet(options)
|
||||
}
|
||||
|
||||
func CLIMinUnitLength() int {
|
||||
return _MinUnitLength
|
||||
}
|
||||
|
||||
func CLIMaxUnitLength() int {
|
||||
return _MaxUnitLength
|
||||
}
|
||||
|
||||
func CLIMinUnitCount() int {
|
||||
return _MinUnitCount
|
||||
}
|
||||
|
||||
func CLIMicroSatRegex() string {
|
||||
return fmt.Sprintf("([acgt]{%d,%d})\\1{%d}", _MinUnitLength, _MaxUnitLength, _MinUnitCount-1)
|
||||
}
|
@ -132,7 +132,7 @@ func FindClosests(sequence *obiseq.BioSequence,
|
||||
lcs, alilength := -1, -1
|
||||
switch maxe {
|
||||
case 0:
|
||||
if obiutils.UnsafeStringFreomBytes(sequence.Sequence()) == obiutils.UnsafeStringFreomBytes(references[order].Sequence()) {
|
||||
if obiutils.UnsafeStringFromBytes(sequence.Sequence()) == obiutils.UnsafeStringFromBytes(references[order].Sequence()) {
|
||||
score = 0
|
||||
alilength = sequence.Len()
|
||||
lcs = alilength
|
||||
@ -279,7 +279,7 @@ func Identify(sequence *obiseq.BioSequence,
|
||||
var bestmatch string
|
||||
var taxon *obitax.TaxNode
|
||||
|
||||
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFreomBytes(sequence.Sequence())]
|
||||
exacttaxon, ok := (*db.ExactTaxid)[obiutils.UnsafeStringFromBytes(sequence.Sequence())]
|
||||
if ok {
|
||||
taxon = exacttaxon.Taxon
|
||||
bestmatch = exacttaxon.Id
|
||||
@ -399,7 +399,7 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
|
||||
|
||||
ft[len(ft)] = taxa[i]
|
||||
|
||||
seqstr := obiutils.UnsafeStringFreomBytes(seq.Sequence())
|
||||
seqstr := obiutils.UnsafeStringFromBytes(seq.Sequence())
|
||||
em, ok := exactmatch[seqstr]
|
||||
|
||||
if !ok {
|
||||
|
@ -2,7 +2,7 @@ package obiutils
|
||||
|
||||
import "unsafe"
|
||||
|
||||
func UnsafeStringFreomBytes(data []byte) string {
|
||||
func UnsafeStringFromBytes(data []byte) string {
|
||||
if len(data) > 0 {
|
||||
s := unsafe.String(unsafe.SliceData(data), len(data))
|
||||
return s
|
||||
|
Reference in New Issue
Block a user