diff --git a/cmd/obitools/obiclean/main.go b/cmd/obitools/obiclean/main.go new file mode 100644 index 0000000..24b3ace --- /dev/null +++ b/cmd/obitools/obiclean/main.go @@ -0,0 +1,22 @@ +package main + +import ( + "os" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiclean" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" +) + +func main() { + optionParser := obioptions.GenerateOptionParser(obiclean.OptionSet) + + _, args, _ := optionParser(os.Args) + + fs, _ := obiconvert.ReadBioSequencesBatch(args...) + + cleaned := obiclean.IOBIClean(fs) + + obiconvert.WriteBioSequencesBatch(cleaned, true) +} diff --git a/cmd/test/main.go b/cmd/test/main.go index 97a9e9c..59500cb 100644 --- a/cmd/test/main.go +++ b/cmd/test/main.go @@ -12,6 +12,7 @@ import ( func main() { + // Creating a file called cpu.trace. ftrace, err := os.Create("cpu.trace") if err != nil { log.Fatal(err) diff --git a/go.mod b/go.mod index fcb0eaa..215f57f 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module git.metabarcoding.org/lecasofts/go/obitools -go 1.17 +go 1.18 require ( github.com/DavidGamba/go-getoptions v0.25.3 @@ -12,10 +12,15 @@ require ( ) require ( + github.com/akualab/gjoa v0.0.0-20150423185904-0953495dbcc7 // indirect + github.com/golang/glog v1.0.0 // indirect github.com/mattn/go-runewidth v0.0.13 // indirect github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect github.com/rivo/uniseg v0.2.0 // indirect golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 // indirect + golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3 // indirect golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a // indirect golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect + gonum.org/v1/gonum v0.11.0 // indirect + scientificgo.org/special v0.0.0 // indirect ) diff --git a/go.sum b/go.sum index 380b392..3e38654 100644 --- a/go.sum +++ b/go.sum @@ -1,14 +1,21 @@ +dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= +github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/DavidGamba/go-getoptions v0.25.3 h1:lSPcMkwWvVZU05C+Uz4DKnKN5wz4bcD1QvJ/QHCRexo= github.com/DavidGamba/go-getoptions v0.25.3/go.mod h1:qLaLSYeQ8sUVOfKuu5JT5qKKS3OCwyhkYSJnoG+ggmo= github.com/PaesslerAG/gval v1.1.2 h1:EROKxV4/fAKWb0Qoj7NOxmHZA7gcpjOV9XgiRZMRCUU= github.com/PaesslerAG/gval v1.1.2/go.mod h1:Fa8gfkCmUsELXgayr8sfL/sw+VzCVoa03dcOcR/if2w= github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI= github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8= +github.com/akualab/gjoa v0.0.0-20150423185904-0953495dbcc7 h1:v8r5rCWpphm5f+yaKDQvVKM46jj0SB7iDUTVS1MHSAg= +github.com/akualab/gjoa v0.0.0-20150423185904-0953495dbcc7/go.mod h1:zhxCkmvf40lxgYc6aJuF6AOwfzuvFIANF9OQaMz5xNs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/goccy/go-json v0.9.4 h1:L8MLKG2mvVXiQu07qB6hmfqeSYQdOnqPot2GhsIwIaI= github.com/goccy/go-json v0.9.4/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I= +github.com/golang/glog v1.0.0 h1:nfP3RFugxnNRyKgeWd4oI1nYvXpxrx8ck8ZrcizshdQ= +github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4= github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU= @@ -29,9 +36,24 @@ github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/tevino/abool/v2 v2.0.1 h1:OF7FC5V5z3yAWyixbc32ecEzrgAJCsPkVOsPM2qoZPI= github.com/tevino/abool/v2 v2.0.1/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 h1:71vQrMauZZhcTVK6KdYM+rklehEEwb3E+ZhaE5jrPrE= golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3 h1:n9HxLrNxWWtEb1cA950nuEEj3QnKbtsCJ6KjcgisNUs= +golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE= +golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= +golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= +golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= +golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -43,5 +65,13 @@ golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190927191325-030b2cf1153e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E= +gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= +scientificgo.org/special v0.0.0 h1:P6WJkECo6tgtvZAEfNXl+KEB9ReAatjKAeX8U07mjSc= +scientificgo.org/special v0.0.0/go.mod h1:LoGVh9tS431RLTJo7gFlYDKFWq44cEb7QqL+M0EKtZU= +scientificgo.org/testutil v0.0.0/go.mod h1:Go6R4b+9YkFocMo3H3vNQ7tjbrX9Rc12wal7NZjvPXg= diff --git a/pkg/goutils/goutils.go b/pkg/goutils/goutils.go index 097b70d..c6133ec 100644 --- a/pkg/goutils/goutils.go +++ b/pkg/goutils/goutils.go @@ -57,6 +57,7 @@ func InterfaceToInt(i interface{}) (val int, err error) { return } +// If the interface{} can be cast to an int, return true. func CastableToInt(i interface{}) bool { switch i.(type) { @@ -70,7 +71,10 @@ func CastableToInt(i interface{}) bool { } } -// CopyMap makes a deep copy of a map[string]interface{}. +// "CopyMap copies the contents of a map[string]interface{} to another map[string]interface{}." +// +// The function uses the gob package to encode the source map into a buffer and then decode the buffer +// into the destination map func CopyMap(dest, src map[string]interface{}) { buf := new(bytes.Buffer) gob.NewEncoder(buf).Encode(src) @@ -78,6 +82,7 @@ func CopyMap(dest, src map[string]interface{}) { } // Read a whole file into the memory and store it as array of lines +// It reads a file line by line, and returns a slice of strings, one for each line func ReadLines(path string) (lines []string, err error) { var ( file *os.File diff --git a/pkg/obialign/is_d0_or_d1.go b/pkg/obialign/is_d0_or_d1.go new file mode 100644 index 0000000..0ecfcde --- /dev/null +++ b/pkg/obialign/is_d0_or_d1.go @@ -0,0 +1,78 @@ +package obialign + +import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + +func abs(x int) int { + if x < 0 { + return -x + } + + return x +} + +func D1Or0(seq1, seq2 *obiseq.BioSequence) (int, int, byte, byte) { + + pos := -1 + + l1 := seq1.Length() + l2 := seq2.Length() + + if abs(l1-l2) > 1 { + return -1, pos, 0, 0 + } + + s1 := seq1.Sequence() + s2 := seq2.Sequence() + + b1 := 0 + b2 := 0 + + // Scans the sequences from their beginings as long as they are identical + + for b1 < l1 && b2 < l2 && s1[b1] == s2[b2] { + b1++ + b2++ + } + + if b1 == l1 && b2 == l2 { + return 0, pos, 0, 0 + } + + // Scans the sequences from their ends as long as they are identical + + e1 := l1 - 1 + e2 := l2 - 1 + + for (e1 > b1 || e2 > b2) && s1[e1] == s2[e2] { + e1-- + e2-- + } + + if (l1 == l2 && (e1 > b1 || e2 > b2)) || + (l1 > l2 && e1 > b1) || + (l1 < l2 && e2 > b2) { + return -1, pos, 0, 0 + } + + if b1 >= e1 { + if e1 > e2 { + pos = e1 + } else { + pos = e2 + } + } + + a1 := byte('-') + a2 := byte('-') + + if e2 >= e1 { + a2 = s2[e2] + } + + if e2 <= e1 { + a1 = s1[e1] + } + + return 1, pos, a1, a2 + +} diff --git a/pkg/obiapat/pcr.go b/pkg/obiapat/pcr.go index c8c85a0..7ec0350 100644 --- a/pkg/obiapat/pcr.go +++ b/pkg/obiapat/pcr.go @@ -138,6 +138,7 @@ func OptionMaxLength(maxLength int) WithOption { // error allowed when matching the forward // primer. func OptionForwardPrimer(primer string, max int) WithOption { + f := WithOption(func(opt Options) { var err error diff --git a/pkg/obiformats/fastseq_obi_header.go b/pkg/obiformats/fastseq_obi_header.go index 72b627c..1710bbf 100644 --- a/pkg/obiformats/fastseq_obi_header.go +++ b/pkg/obiformats/fastseq_obi_header.go @@ -210,17 +210,22 @@ func ParseOBIFeatures(text string, annotations obiseq.Annotation) string { bvalue = bytes.TrimSpace(part[m[0]:(m[1] - 1)]) j := bytes.ReplaceAll(bvalue, []byte("'"), []byte(`"`)) var err error - if strings.HasPrefix(key, "merged_") || - strings.HasSuffix(key, "_count") { + switch { + case strings.HasPrefix(key, "merged_") || + strings.HasSuffix(key, "_count"): dict := make(map[string]int) err = json.Unmarshal(j, &dict) value = dict - } else { + case strings.HasSuffix(key, "_status"): + dict := make(map[string]string) + err = json.Unmarshal(j, &dict) + value = dict + default: dict := make(map[string]interface{}) err = json.Unmarshal(j, &dict) value = dict } - + if err != nil { value = string(bvalue) } diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index ed7eff1..4c735c1 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -494,6 +494,9 @@ func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch, return newIter } +// A function that takes a predicate and returns two IBioSequenceBatch iterators. +// Sequences extracted from the input iterator are distributed among both the +// iterator following the predicate value. func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate, size int, sizes ...int) (IBioSequenceBatch, IBioSequenceBatch) { buffsize := iterator.BufferSize() @@ -560,6 +563,8 @@ func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate, return trueIter, falseIter } +// Filtering a batch of sequences. +// A function that takes a predicate and a batch of sequences and returns a filtered batch of sequences. func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate, size int, sizes ...int) IBioSequenceBatch { buffsize := iterator.BufferSize() @@ -615,6 +620,8 @@ func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate, return trueIter.Rebatch(size) } +// Load every sequences availables from an IBioSequenceBatch iterator into +// a large obiseq.BioSequenceSlice. func (iterator IBioSequenceBatch) Load() obiseq.BioSequenceSlice { chunck := obiseq.MakeBioSequenceSlice() @@ -625,4 +632,42 @@ func (iterator IBioSequenceBatch) Load() obiseq.BioSequenceSlice { } return chunck -} \ No newline at end of file +} + +// It takes a slice of BioSequence objects, and returns an iterator that will return batches of +// BioSequence objects +func IBatchOver(data obiseq.BioSequenceSlice, + size int, sizes ...int) IBioSequenceBatch { + + buffsize := 0 + + if len(sizes) > 0 { + buffsize = sizes[1] + } + + trueIter := MakeIBioSequenceBatch(buffsize) + + trueIter.Add(1) + + go func() { + trueIter.WaitAndClose() + }() + + go func() { + ldata := len(data) + batchid := 0 + next := 0 + for i:=0; i < ldata; i=next { + next = i + size + if next > ldata { + next = ldata + } + trueIter.Push(MakeBioSequenceBatch(batchid,data[i:next])) + batchid++ + } + + trueIter.Done() + }() + + return trueIter +} diff --git a/pkg/obiseq/merge.go b/pkg/obiseq/merge.go index 69ce16b..ff17067 100644 --- a/pkg/obiseq/merge.go +++ b/pkg/obiseq/merge.go @@ -43,6 +43,9 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues { case StatsOnValues: stats = istat newstat = false + case map[string]int: + stats = istat + newstat = false case map[string]interface{}: stats = make(StatsOnValues, len(istat)) newstat = false diff --git a/pkg/obistats/minmax.go b/pkg/obistats/minmax.go new file mode 100644 index 0000000..a88318f --- /dev/null +++ b/pkg/obistats/minmax.go @@ -0,0 +1,27 @@ +package obistats + + + +func Max[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T { + m := data[0] + + for _,v := range data { + if v > m { + m = v + } + } + + return m +} + +func Min[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T { + m := data[0] + + for _,v := range data { + if v < m { + m = v + } + } + + return m +} \ No newline at end of file diff --git a/pkg/obistats/utils.go b/pkg/obistats/utils.go new file mode 100644 index 0000000..65206d6 --- /dev/null +++ b/pkg/obistats/utils.go @@ -0,0 +1,49 @@ +package obistats + +import ( + log "github.com/sirupsen/logrus" + "math" +) + +// Lchoose returns logarithms of binomial coefficients +func Lchoose(n,x int) float64 { + fn := float64(n) + fx := float64(x) + + ln1, _ := math.Lgamma(fn + 1.0) + lx1, _ := math.Lgamma(fx + 1.0) + lnx1, _ := math.Lgamma(fn - fx + 1.0) + + return ln1 - lx1 - lnx1 +} + +func Choose(n,x int) float64 { + return math.Exp(Lchoose(x,n)) +} + + +func LogAddExp(x, y float64) float64 { + + tmp := x - y + + if tmp > 0 { + + return x + math.Log1p(math.Exp(-tmp)) + + } else if tmp <= 0 { + + return y + math.Log1p(math.Exp(tmp)) + + } else { + + // Nans, or infinities of the same sign involved + + log.Errorf("logaddexp %f %f", x, y) + + return x + y + + } + + } + + \ No newline at end of file diff --git a/pkg/obitools/obiclean/graph.go b/pkg/obitools/obiclean/graph.go new file mode 100644 index 0000000..b0ada8f --- /dev/null +++ b/pkg/obitools/obiclean/graph.go @@ -0,0 +1,448 @@ +package obiclean + +import ( + "bytes" + "fmt" + "log" + "math" + "os" + "path" + "sort" + "sync" + "text/template" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign" + "github.com/schollz/progressbar/v3" +) + +type Ratio struct { + From int + To int + Pos int + Length int +} + +func abs(x int) int { + if x < 0 { + return -x + } + return x +} + +func max(x, y int) int { + if x > y { + return x + } + return y +} + +func min(x, y int) int { + if x < y { + return x + } + return y +} + +func minMax(x, y int) (int, int) { + if x < y { + return x, y + } + return y, x + +} + +// It takes a filename and a 2D slice of floats pruduced during graph building, +// and writes a CSV file with the first column being the +// first nucleotide, the second column being the second nucleotide, and the third column being the +// ratio +func EmpiricalDistCsv(filename string, data [][]Ratio) { + file, err := os.Create(filename) + if err != nil { + fmt.Println(err) + } + defer file.Close() + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[Save CSV stat ratio file"), + ) + + bar := progressbar.NewOptions(len(data), pbopt...) + + fmt.Fprintln(file, "From,To,Count_from,Count_to,Position,length") + for code, dist := range data { + a1, a2 := intToNucPair(code) + for _, ratio := range dist { + fmt.Fprintf(file, "%c,%c,%d,%d,%d,%d\n", a1, a2, ratio.From, ratio.To, ratio.Pos, ratio.Length) + } + bar.Add(1) + } +} + +// It takes a slice of sequences, a sample name and a statistical threshold and returns a string +// containing a GML representation of the graph +func Gml(seqs *[]*seqPCR, sample string, statThreshold int) string { + // (*seqs)[1].Count + var dot bytes.Buffer + digraphTpl := template.New("gml_digraph") + digraph := `graph [ + comment "Obiclean graph for sample {{ Name }}" + directed 1 + {{range $index, $data:= .}} + {{ if or $data.Fathers $data.HasSon}} + node [ id {{$index}} + graphics [ + type "{{ Shape $data.Count }}" + fill "{{ if and $data.HasSon (not $data.Fathers)}}#0000FF{{ else }}#00FF00{{ end }}" + h {{ Sqrt $data.Count }} + w {{ Sqrt $data.Count }} + ] + + ] + {{ end }} + {{ end }} + + {{range $index, $data:= .}} + {{range $i, $father:= $data.Fathers}} + edge [ source {{$index}} + target {{$father}} + color "{{ if gt (index $data.Dist $i) 1 }}#FF0000{{ else }}#00FF00{{ end }}" + label "{{(index $data.Dist $i)}}" + ] + {{ end }} + {{ end }} + ] + + ` + + tmpl, err := digraphTpl.Funcs(template.FuncMap{ + "Sqrt": func(i int) int { return 3 * int(math.Floor(math.Sqrt(float64(i)))) }, + "Name": func() string { return sample }, + "Shape": func(i int) string { + if i >= statThreshold { + return "circle" + } else { + return "rectangle" + } + }, + }).Parse(digraph) + + if err != nil { + panic(err) + } + + err = tmpl.Execute(&dot, *seqs) + + if err != nil { + panic(err) + } + + return dot.String() +} + +func SaveGMLGraphs(dirname string, + samples map[string]*[]*seqPCR, + statThreshold int, +) { + + if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() { + // path does not exist or is not directory + os.RemoveAll(dirname) + err := os.Mkdir(dirname, 0755) + + if err != nil { + log.Panicf("Cannot create directory %s for saving graphs", dirname) + } + } + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[Save GML Graph files]"), + ) + + bar := progressbar.NewOptions(len(samples), pbopt...) + + for name, seqs := range samples { + + file, err := os.Create(path.Join(dirname, + fmt.Sprintf("%s.gml", name))) + + if err != nil { + fmt.Println(err) + } + + file.WriteString(Gml(seqs, name, statThreshold)) + file.Close() + + bar.Add(1) + } + +} + +func nucPair(a, b byte) int { + + n1 := 0 + switch a { + case 'a': + n1 = 1 + case 'c': + n1 = 2 + case 'g': + n1 = 3 + case 't': + n1 = 4 + } + + n2 := 0 + switch b { + case 'a': + n2 = 1 + case 'c': + n2 = 2 + case 'g': + n2 = 3 + case 't': + n2 = 4 + } + + return n1*5 + n2 + +} + +func intToNucPair(code int) (a, b byte) { + var decode = []byte{'-', 'a', 'c', 'g', 't'} + c1 := code / 5 + c2 := code - c1*5 + + return decode[c1], decode[c2] +} + +func buildSamplePairs(seqs *[]*seqPCR, minStatRatio int, workers int) ([][]Ratio, int) { + nseq := len(*seqs) + running := sync.WaitGroup{} + + linePairs := func(i int) [][]Ratio { + + ratio := make([][]Ratio, 25) + son := (*seqs)[i] + + for j := i + 1; j < nseq; j++ { + father := (*seqs)[j] + d, pos, a1, a2 := obialign.D1Or0(son.Sequence, father.Sequence) + if d > 0 { + son.Fathers = append(son.Fathers, j) + son.Dist = append(son.Dist, d) + father.SonCount++ + + if father.Count > minStatRatio { + n := nucPair(a1, a2) + ratio[n] = append(ratio[n], Ratio{father.Count, son.Count, pos, father.Sequence.Length()}) + } + } + } + + return ratio + } + + lineChan := make(chan int) + idxChan := make(chan [][]Ratio) + + ff := func() { + for i := range lineChan { + idxChan <- linePairs(i) + } + + running.Done() + } + + running.Add(workers) + + go func() { + running.Wait() + close(idxChan) + }() + + for i := 0; i < workers; i++ { + go ff() + } + + go func() { + for i := 0; i < nseq; i++ { + lineChan <- i + } + close(lineChan) + }() + + np := nseq * (nseq - 1) / 2 + ratio := make([][]Ratio, 25) + for data := range idxChan { + for i, r := range data { + ratio[i] = append(ratio[i], r...) + } + + } + + return ratio, np +} + +func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int { + nseq := len(*seqs) + running := sync.WaitGroup{} + + linePairs := func(matrix *obialign.LCSMatrix, i int) { + son := (*seqs)[i] + for j := i + 1; j < nseq; j++ { + father := (*seqs)[j] + d, _, _, _ := obialign.D1Or0(son.Sequence, father.Sequence) + + if d < 0 { + lcs, lali := obialign.LCSScore(son.Sequence, father.Sequence, + step, + matrix) + d := (lali - lcs) + if lcs >= 0 && d <= step && step > 0 { + son.Fathers = append(son.Fathers, j) + son.Dist = append(son.Dist, d) + father.SonCount++ + //a, b := minMax((*seqs)[i].Count, (*seqs)[j].Count) + } + } + + } + } + + lineChan := make(chan int) + // idxChan := make(chan [][]Ratio) + + ff := func() { + matrix := obialign.NewLCSMatrix(nil, 150, 150, step) + for i := range lineChan { + linePairs(matrix, i) + } + + running.Done() + } + + running.Add(workers) + + for i := 0; i < workers; i++ { + go ff() + } + + go func() { + for i := 0; i < nseq; i++ { + if len((*seqs)[i].Fathers) == 0 { + lineChan <- i + } + } + close(lineChan) + }() + + running.Wait() + np := nseq * (nseq - 1) / 2 + return np +} + +func FilterGraphOnRatio(seqs *[]*seqPCR, ratio float64) { + for _, s1 := range *seqs { + c1 := float64(s1.Count) + f := s1.Fathers + d := s1.Dist + j := 0 + for i, s2 := range f { + f[j] = f[i] + d[j] = d[i] + if (c1 / float64((*seqs)[s2].Count)) <= math.Pow(ratio, float64(d[i])) { + j++ + } else { + (*seqs)[s2].SonCount-- + } + } + s1.Fathers = f[0:j] + s1.Dist = d[0:j] + } +} + +// sortSamples sorts the sequences in each sample by their increasing count +func sortSamples(samples map[string]*([]*seqPCR)) { + + for _, s := range samples { + sort.SliceStable(*s, func(i, j int) bool { + return (*s)[i].Count < (*s)[j].Count + }) + } + +} + +func ObicleanStatus(seq *seqPCR) string { + if len(seq.Fathers) == 0 { + if seq.SonCount > 0 { + return "h" + } else { + return "s" + } + } else { + return "i" + } +} + +func BuildSeqGraph(samples map[string]*[]*seqPCR, + maxError, minStatRatio, workers int) [][]Ratio { + + sortSamples(samples) + + npairs := 0 + for _, seqs := range samples { + nseq := len(*seqs) + npairs += nseq * (nseq - 1) / 2 + } + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[One error graph]"), + ) + + bar := progressbar.NewOptions(npairs, pbopt...) + all_ratio := make([][]Ratio, 25) + for _, seqs := range samples { + ratio, np := buildSamplePairs(seqs, minStatRatio, workers) + + for i, r := range ratio { + all_ratio[i] = append(all_ratio[i], r...) + } + + bar.Add(np) + } + + if maxError > 1 { + pbopt = make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[Adds multiple errors]"), + ) + + bar = progressbar.NewOptions(npairs, pbopt...) + + for _, seqs := range samples { + np := extendSimilarityGraph(seqs, maxError, workers) + bar.Add(np) + } + } + + return all_ratio +} diff --git a/pkg/obitools/obiclean/obiclean.go b/pkg/obitools/obiclean/obiclean.go new file mode 100644 index 0000000..b25a075 --- /dev/null +++ b/pkg/obitools/obiclean/obiclean.go @@ -0,0 +1,251 @@ +package obiclean + +import ( + "fmt" + "os" + + "git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions" + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq" + "github.com/schollz/progressbar/v3" + log "github.com/sirupsen/logrus" +) + +type seqPCR struct { + Count int // number of reads associated to a sequence in a PCR + Sequence *obiseq.BioSequence // pointer to the corresponding sequence + SonCount int + Fathers []int + Dist []int +} + +// buildSamples sorts the sequences by samples +// +// The sequences are distributed according to their sample association. +// The function returns a map indexed by sample names. +// Each sample is represented by a vector where each element associates +// a sequence to its count of occurrences in the considered sample. +func buildSamples(dataset obiseq.BioSequenceSlice, + tag, NAValue string) map[string]*([]*seqPCR) { + samples := make(map[string]*([]*seqPCR)) + + for _, s := range dataset { + stats := s.StatsOn(tag, NAValue) + + for k, v := range stats { + pcr, ok := samples[k] + + if !ok { + p := make([]*seqPCR, 0, 10) + pcr = &p + samples[k] = pcr + } + + *pcr = append(*pcr, &seqPCR{ + Count: v, + Sequence: s, + SonCount: 0, + }) + } + } + + return samples +} + +func annotateOBIClean(dataset obiseq.BioSequenceSlice, + sample map[string]*([]*seqPCR), + tag, NAValue string) obiiter.IBioSequenceBatch { + batchsize := 1000 + var annot = func(data obiseq.BioSequenceSlice) obiseq.BioSequenceSlice { + + for _, s := range data { + status := Status(s) + head := 0 + internal := 0 + singleton := 0 + for _, v := range status { + switch v { + case "i": + internal++ + case "h": + head++ + case "s": + singleton++ + } + } + is_head := (head + singleton) > 0 + + annotation := s.Annotations() + annotation["obiclean_head"] = is_head + annotation["obiclean_singletoncount"] = singleton + annotation["obiclean_internalcount"] = internal + annotation["obiclean_headcount"] = head + annotation["obiclean_samplecount"] = head + internal + singleton + + } + return data + } + + iter := obiiter.IBatchOver(dataset, batchsize) + riter := iter.MakeISliceWorker(annot) + + return riter +} + +func IsHead(sequence *obiseq.BioSequence) bool { + annotation := sequence.Annotations() + iishead, ok := annotation["obiclean_head"] + ishead := true + + if ok { + switch iishead := iishead.(type) { + case bool: + ishead = iishead + default: + log.Panic("obiclean_head attribute of sequence %s must be a boolean not : %v", sequence.Id(), iishead) + } + } + + return ishead +} + +func HeadCount(sequence *obiseq.BioSequence) int { + var err error + annotation := sequence.Annotations() + ivalue, ok := annotation["obiclean_headcount"] + value := 0 + + if ok { + value, err = goutils.InterfaceToInt(value) + if err != nil { + log.Panic("obiclean_headcount attribute of sequence %s must be an integer value not : %v", sequence.Id(), ivalue) + } + } + + return value +} + +func InternalCount(sequence *obiseq.BioSequence) int { + var err error + annotation := sequence.Annotations() + ivalue, ok := annotation["obiclean_internalcount"] + value := 0 + + if ok { + value, err = goutils.InterfaceToInt(value) + if err != nil { + log.Panic("obiclean_internalcount attribute of sequence %s must be an integer value not : %v", sequence.Id(), ivalue) + } + } + + return value +} + +func SingletonCount(sequence *obiseq.BioSequence) int { + var err error + annotation := sequence.Annotations() + ivalue, ok := annotation["obiclean_samplecount"] + value := 0 + + if ok { + value, err = goutils.InterfaceToInt(value) + if err != nil { + log.Panic("obiclean_samplecount attribute of sequence %s must be an integer value not : %v", sequence.Id(), ivalue) + } + } + + return value +} + +func Status(sequence *obiseq.BioSequence) map[string]string { + annotation := sequence.Annotations() + iobistatus, ok := annotation["obiclean_status"] + var obistatus map[string]string + + if ok { + switch iobistatus := iobistatus.(type) { + case map[string]string: + obistatus = iobistatus + case map[string]interface{}: + obistatus = make(map[string]string) + for k, v := range iobistatus { + obistatus[k] = fmt.Sprint(v) + } + } + } else { + obistatus = make(map[string]string) + annotation["obiclean_status"] = obistatus + } + + return obistatus +} + +func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { + + db := itertator.Load() + + log.Infof("Sequence dataset of %d sequeences loaded\n", len(db)) + + samples := buildSamples(db, SampleAttribute(), "NA") + + log.Infof("Dataset composed of %d samples\n", len(samples)) + + all_ratio := BuildSeqGraph(samples, + DistStepMax(), + MinCountToEvalMutationRate(), + obioptions.CLIParallelWorkers()) + + if RatioMax() < 1.0 { + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[Filter graph on abundance ratio]"), + ) + + bar := progressbar.NewOptions(len(samples), pbopt...) + + for _, seqs := range samples { + FilterGraphOnRatio(seqs, RatioMax()) + bar.Add(1) + } + } + + if IsSaveRatioTable() { + EmpiricalDistCsv(RatioTableFilename(), all_ratio) + } + + if SaveGraphToFiles() { + SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate()) + } + + pbopt := make([]progressbar.Option, 0, 5) + pbopt = append(pbopt, + progressbar.OptionSetWriter(os.Stderr), + progressbar.OptionSetWidth(15), + progressbar.OptionShowIts(), + progressbar.OptionSetPredictTime(true), + progressbar.OptionSetDescription("[Annotate sequence status]"), + ) + + bar := progressbar.NewOptions(len(samples), pbopt...) + + for name, seqs := range samples { + for _, pcr := range *seqs { + obistatus := Status(pcr.Sequence) + obistatus[name] = ObicleanStatus(pcr) + } + bar.Add(1) + } + + iter := annotateOBIClean(db, samples, SampleAttribute(), "NA") + + if OnlyHead() { + iter = iter.FilterOn(IsHead, 1000) + } + + return iter +} diff --git a/pkg/obitools/obiclean/options.go b/pkg/obitools/obiclean/options.go new file mode 100644 index 0000000..a98afee --- /dev/null +++ b/pkg/obitools/obiclean/options.go @@ -0,0 +1,113 @@ +package obiclean + +import ( + "git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +var _distStepMax = 1 +var _sampleAttribute = "sample" + +var _ratioMax = 1.0 +var _minEvalRate = 1000 + +var _clusterMode = false +var _onlyHead = false + +var _saveGraph = "__@@NOSAVE@@__" +var _saveRatio = "__@@NOSAVE@@__" + +func ObicleanOptionSet(options *getoptions.GetOpt) { + options.StringVar(&_sampleAttribute, "sample", _sampleAttribute, + options.Alias("s"), + options.Description("Attribute containing sample descriptions (default %s).")) + + options.IntVar(&_distStepMax, "distance", _distStepMax, + options.Alias("d"), + options.Description("Maximum numbers of differences between two variant sequences (default: %d).")) + + options.Float64Var(&_ratioMax, "ratio", _ratioMax, + options.Alias("r"), + options.Description("Threshold ratio between counts (rare/abundant counts)"+ + " of two sequence records so that the less abundant one is a variant of "+ + "the more abundant (default: %3.2f).")) + + options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate, + options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate.")) + + // options.BoolVar(&_clusterMode, "cluster", _clusterMode, + // options.Alias("C"), + // options.Description("Switch obiclean into its clustering mode. This adds information to each sequence about the true."), + // ) + + options.BoolVar(&_onlyHead, "head", _onlyHead, + options.Alias("H"), + options.Description("Select only sequences with the head status in a least one sample."), + ) + + options.StringVar(&_saveGraph, "save-graph", _saveGraph, + options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+ + "The graph files follow the graphml format."), + ) + + options.StringVar(&_saveRatio, "save-ratio", _saveRatio, + options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+ + "The ratio file follows the csv format."), + ) + +} + +func OptionSet(options *getoptions.GetOpt) { + obiconvert.InputOptionSet(options) + obiconvert.OutputOptionSet(options) + ObicleanOptionSet(options) +} + +func DistStepMax() int { + return _distStepMax +} + +// It returns the name of the attibute used to store sample name +func SampleAttribute() string { + return _sampleAttribute +} + +// Return the maximum abundance ratio between read counts to consider sequences as error. +func RatioMax() float64 { + return _ratioMax +} + +// > The function `MinCountToEvalMutationRate()` returns the minimum number of reads that must be +// observed before the mutation rate can be evaluated +func MinCountToEvalMutationRate() int { + return _minEvalRate +} + +func ClusterMode() bool { + return _clusterMode +} + +// `OnlyHead()` returns a boolean value that indicates whether the `-h` flag was passed to the program +func OnlyHead() bool { + return _onlyHead +} + +// Returns true it the obliclean graphs must be saved +func SaveGraphToFiles() bool { + return _saveGraph != "__@@NOSAVE@@__" +} + +// It returns the directory where the graph files are saved +func GraphFilesDirectory() string { + return _saveGraph +} + +// Returns true it the table of ratio must be saved +func IsSaveRatioTable() bool { + return _saveRatio != "__@@NOSAVE@@__" +} + +// It returns the filename of the file that stores the ratio table +func RatioTableFilename() string { + return _saveRatio +} diff --git a/pkg/obitools/obipcr/options.go b/pkg/obitools/obipcr/options.go index a7f888a..f6d60ea 100644 --- a/pkg/obitools/obipcr/options.go +++ b/pkg/obitools/obipcr/options.go @@ -60,7 +60,7 @@ func OptionSet(options *getoptions.GetOpt) { // --forward command line option func ForwardPrimer() string { pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch) - + if err != nil { log.Fatalf("%+v", err) }