mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
A first functional version of obiclean
This commit is contained in:
22
cmd/obitools/obiclean/main.go
Normal file
22
cmd/obitools/obiclean/main.go
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiclean"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
optionParser := obioptions.GenerateOptionParser(obiclean.OptionSet)
|
||||||
|
|
||||||
|
_, args, _ := optionParser(os.Args)
|
||||||
|
|
||||||
|
fs, _ := obiconvert.ReadBioSequencesBatch(args...)
|
||||||
|
|
||||||
|
cleaned := obiclean.IOBIClean(fs)
|
||||||
|
|
||||||
|
obiconvert.WriteBioSequencesBatch(cleaned, true)
|
||||||
|
}
|
@ -12,6 +12,7 @@ import (
|
|||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
|
||||||
|
// Creating a file called cpu.trace.
|
||||||
ftrace, err := os.Create("cpu.trace")
|
ftrace, err := os.Create("cpu.trace")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatal(err)
|
log.Fatal(err)
|
||||||
|
7
go.mod
7
go.mod
@ -1,6 +1,6 @@
|
|||||||
module git.metabarcoding.org/lecasofts/go/obitools
|
module git.metabarcoding.org/lecasofts/go/obitools
|
||||||
|
|
||||||
go 1.17
|
go 1.18
|
||||||
|
|
||||||
require (
|
require (
|
||||||
github.com/DavidGamba/go-getoptions v0.25.3
|
github.com/DavidGamba/go-getoptions v0.25.3
|
||||||
@ -12,10 +12,15 @@ require (
|
|||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
|
github.com/akualab/gjoa v0.0.0-20150423185904-0953495dbcc7 // indirect
|
||||||
|
github.com/golang/glog v1.0.0 // indirect
|
||||||
github.com/mattn/go-runewidth v0.0.13 // indirect
|
github.com/mattn/go-runewidth v0.0.13 // indirect
|
||||||
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
|
||||||
github.com/rivo/uniseg v0.2.0 // indirect
|
github.com/rivo/uniseg v0.2.0 // indirect
|
||||||
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 // indirect
|
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 // indirect
|
||||||
|
golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3 // indirect
|
||||||
golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a // indirect
|
golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a // indirect
|
||||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect
|
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect
|
||||||
|
gonum.org/v1/gonum v0.11.0 // indirect
|
||||||
|
scientificgo.org/special v0.0.0 // indirect
|
||||||
)
|
)
|
||||||
|
30
go.sum
30
go.sum
@ -1,14 +1,21 @@
|
|||||||
|
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
|
||||||
|
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
|
||||||
github.com/DavidGamba/go-getoptions v0.25.3 h1:lSPcMkwWvVZU05C+Uz4DKnKN5wz4bcD1QvJ/QHCRexo=
|
github.com/DavidGamba/go-getoptions v0.25.3 h1:lSPcMkwWvVZU05C+Uz4DKnKN5wz4bcD1QvJ/QHCRexo=
|
||||||
github.com/DavidGamba/go-getoptions v0.25.3/go.mod h1:qLaLSYeQ8sUVOfKuu5JT5qKKS3OCwyhkYSJnoG+ggmo=
|
github.com/DavidGamba/go-getoptions v0.25.3/go.mod h1:qLaLSYeQ8sUVOfKuu5JT5qKKS3OCwyhkYSJnoG+ggmo=
|
||||||
github.com/PaesslerAG/gval v1.1.2 h1:EROKxV4/fAKWb0Qoj7NOxmHZA7gcpjOV9XgiRZMRCUU=
|
github.com/PaesslerAG/gval v1.1.2 h1:EROKxV4/fAKWb0Qoj7NOxmHZA7gcpjOV9XgiRZMRCUU=
|
||||||
github.com/PaesslerAG/gval v1.1.2/go.mod h1:Fa8gfkCmUsELXgayr8sfL/sw+VzCVoa03dcOcR/if2w=
|
github.com/PaesslerAG/gval v1.1.2/go.mod h1:Fa8gfkCmUsELXgayr8sfL/sw+VzCVoa03dcOcR/if2w=
|
||||||
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
||||||
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
||||||
|
github.com/akualab/gjoa v0.0.0-20150423185904-0953495dbcc7 h1:v8r5rCWpphm5f+yaKDQvVKM46jj0SB7iDUTVS1MHSAg=
|
||||||
|
github.com/akualab/gjoa v0.0.0-20150423185904-0953495dbcc7/go.mod h1:zhxCkmvf40lxgYc6aJuF6AOwfzuvFIANF9OQaMz5xNs=
|
||||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||||
|
github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
|
||||||
github.com/goccy/go-json v0.9.4 h1:L8MLKG2mvVXiQu07qB6hmfqeSYQdOnqPot2GhsIwIaI=
|
github.com/goccy/go-json v0.9.4 h1:L8MLKG2mvVXiQu07qB6hmfqeSYQdOnqPot2GhsIwIaI=
|
||||||
github.com/goccy/go-json v0.9.4/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
|
github.com/goccy/go-json v0.9.4/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
|
||||||
|
github.com/golang/glog v1.0.0 h1:nfP3RFugxnNRyKgeWd4oI1nYvXpxrx8ck8ZrcizshdQ=
|
||||||
|
github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4=
|
||||||
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
|
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
|
||||||
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
|
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
|
||||||
github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU=
|
github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU=
|
||||||
@ -29,9 +36,24 @@ github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0
|
|||||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||||
github.com/tevino/abool/v2 v2.0.1 h1:OF7FC5V5z3yAWyixbc32ecEzrgAJCsPkVOsPM2qoZPI=
|
github.com/tevino/abool/v2 v2.0.1 h1:OF7FC5V5z3yAWyixbc32ecEzrgAJCsPkVOsPM2qoZPI=
|
||||||
github.com/tevino/abool/v2 v2.0.1/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY=
|
github.com/tevino/abool/v2 v2.0.1/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY=
|
||||||
|
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||||
|
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||||
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 h1:71vQrMauZZhcTVK6KdYM+rklehEEwb3E+ZhaE5jrPrE=
|
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 h1:71vQrMauZZhcTVK6KdYM+rklehEEwb3E+ZhaE5jrPrE=
|
||||||
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
|
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
|
||||||
|
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
|
||||||
|
golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3 h1:n9HxLrNxWWtEb1cA950nuEEj3QnKbtsCJ6KjcgisNUs=
|
||||||
|
golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE=
|
||||||
|
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
|
||||||
|
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
|
||||||
|
golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o=
|
||||||
|
golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
|
||||||
|
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||||
|
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||||
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
|
||||||
|
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||||
|
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||||
|
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
|
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||||
@ -43,5 +65,13 @@ golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a/go.mod h1:oPkhp1MJrh7nUepCBc
|
|||||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY=
|
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY=
|
||||||
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
|
||||||
|
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||||
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||||
|
golang.org/x/tools v0.0.0-20190927191325-030b2cf1153e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||||
|
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
|
gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E=
|
||||||
|
gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA=
|
||||||
|
scientificgo.org/special v0.0.0 h1:P6WJkECo6tgtvZAEfNXl+KEB9ReAatjKAeX8U07mjSc=
|
||||||
|
scientificgo.org/special v0.0.0/go.mod h1:LoGVh9tS431RLTJo7gFlYDKFWq44cEb7QqL+M0EKtZU=
|
||||||
|
scientificgo.org/testutil v0.0.0/go.mod h1:Go6R4b+9YkFocMo3H3vNQ7tjbrX9Rc12wal7NZjvPXg=
|
||||||
|
@ -57,6 +57,7 @@ func InterfaceToInt(i interface{}) (val int, err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If the interface{} can be cast to an int, return true.
|
||||||
func CastableToInt(i interface{}) bool {
|
func CastableToInt(i interface{}) bool {
|
||||||
|
|
||||||
switch i.(type) {
|
switch i.(type) {
|
||||||
@ -70,7 +71,10 @@ func CastableToInt(i interface{}) bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// CopyMap makes a deep copy of a map[string]interface{}.
|
// "CopyMap copies the contents of a map[string]interface{} to another map[string]interface{}."
|
||||||
|
//
|
||||||
|
// The function uses the gob package to encode the source map into a buffer and then decode the buffer
|
||||||
|
// into the destination map
|
||||||
func CopyMap(dest, src map[string]interface{}) {
|
func CopyMap(dest, src map[string]interface{}) {
|
||||||
buf := new(bytes.Buffer)
|
buf := new(bytes.Buffer)
|
||||||
gob.NewEncoder(buf).Encode(src)
|
gob.NewEncoder(buf).Encode(src)
|
||||||
@ -78,6 +82,7 @@ func CopyMap(dest, src map[string]interface{}) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Read a whole file into the memory and store it as array of lines
|
// Read a whole file into the memory and store it as array of lines
|
||||||
|
// It reads a file line by line, and returns a slice of strings, one for each line
|
||||||
func ReadLines(path string) (lines []string, err error) {
|
func ReadLines(path string) (lines []string, err error) {
|
||||||
var (
|
var (
|
||||||
file *os.File
|
file *os.File
|
||||||
|
78
pkg/obialign/is_d0_or_d1.go
Normal file
78
pkg/obialign/is_d0_or_d1.go
Normal file
@ -0,0 +1,78 @@
|
|||||||
|
package obialign
|
||||||
|
|
||||||
|
import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
|
||||||
|
func abs(x int) int {
|
||||||
|
if x < 0 {
|
||||||
|
return -x
|
||||||
|
}
|
||||||
|
|
||||||
|
return x
|
||||||
|
}
|
||||||
|
|
||||||
|
func D1Or0(seq1, seq2 *obiseq.BioSequence) (int, int, byte, byte) {
|
||||||
|
|
||||||
|
pos := -1
|
||||||
|
|
||||||
|
l1 := seq1.Length()
|
||||||
|
l2 := seq2.Length()
|
||||||
|
|
||||||
|
if abs(l1-l2) > 1 {
|
||||||
|
return -1, pos, 0, 0
|
||||||
|
}
|
||||||
|
|
||||||
|
s1 := seq1.Sequence()
|
||||||
|
s2 := seq2.Sequence()
|
||||||
|
|
||||||
|
b1 := 0
|
||||||
|
b2 := 0
|
||||||
|
|
||||||
|
// Scans the sequences from their beginings as long as they are identical
|
||||||
|
|
||||||
|
for b1 < l1 && b2 < l2 && s1[b1] == s2[b2] {
|
||||||
|
b1++
|
||||||
|
b2++
|
||||||
|
}
|
||||||
|
|
||||||
|
if b1 == l1 && b2 == l2 {
|
||||||
|
return 0, pos, 0, 0
|
||||||
|
}
|
||||||
|
|
||||||
|
// Scans the sequences from their ends as long as they are identical
|
||||||
|
|
||||||
|
e1 := l1 - 1
|
||||||
|
e2 := l2 - 1
|
||||||
|
|
||||||
|
for (e1 > b1 || e2 > b2) && s1[e1] == s2[e2] {
|
||||||
|
e1--
|
||||||
|
e2--
|
||||||
|
}
|
||||||
|
|
||||||
|
if (l1 == l2 && (e1 > b1 || e2 > b2)) ||
|
||||||
|
(l1 > l2 && e1 > b1) ||
|
||||||
|
(l1 < l2 && e2 > b2) {
|
||||||
|
return -1, pos, 0, 0
|
||||||
|
}
|
||||||
|
|
||||||
|
if b1 >= e1 {
|
||||||
|
if e1 > e2 {
|
||||||
|
pos = e1
|
||||||
|
} else {
|
||||||
|
pos = e2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
a1 := byte('-')
|
||||||
|
a2 := byte('-')
|
||||||
|
|
||||||
|
if e2 >= e1 {
|
||||||
|
a2 = s2[e2]
|
||||||
|
}
|
||||||
|
|
||||||
|
if e2 <= e1 {
|
||||||
|
a1 = s1[e1]
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1, pos, a1, a2
|
||||||
|
|
||||||
|
}
|
@ -138,6 +138,7 @@ func OptionMaxLength(maxLength int) WithOption {
|
|||||||
// error allowed when matching the forward
|
// error allowed when matching the forward
|
||||||
// primer.
|
// primer.
|
||||||
func OptionForwardPrimer(primer string, max int) WithOption {
|
func OptionForwardPrimer(primer string, max int) WithOption {
|
||||||
|
|
||||||
f := WithOption(func(opt Options) {
|
f := WithOption(func(opt Options) {
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
|
@ -210,17 +210,22 @@ func ParseOBIFeatures(text string, annotations obiseq.Annotation) string {
|
|||||||
bvalue = bytes.TrimSpace(part[m[0]:(m[1] - 1)])
|
bvalue = bytes.TrimSpace(part[m[0]:(m[1] - 1)])
|
||||||
j := bytes.ReplaceAll(bvalue, []byte("'"), []byte(`"`))
|
j := bytes.ReplaceAll(bvalue, []byte("'"), []byte(`"`))
|
||||||
var err error
|
var err error
|
||||||
if strings.HasPrefix(key, "merged_") ||
|
switch {
|
||||||
strings.HasSuffix(key, "_count") {
|
case strings.HasPrefix(key, "merged_") ||
|
||||||
|
strings.HasSuffix(key, "_count"):
|
||||||
dict := make(map[string]int)
|
dict := make(map[string]int)
|
||||||
err = json.Unmarshal(j, &dict)
|
err = json.Unmarshal(j, &dict)
|
||||||
value = dict
|
value = dict
|
||||||
} else {
|
case strings.HasSuffix(key, "_status"):
|
||||||
|
dict := make(map[string]string)
|
||||||
|
err = json.Unmarshal(j, &dict)
|
||||||
|
value = dict
|
||||||
|
default:
|
||||||
dict := make(map[string]interface{})
|
dict := make(map[string]interface{})
|
||||||
err = json.Unmarshal(j, &dict)
|
err = json.Unmarshal(j, &dict)
|
||||||
value = dict
|
value = dict
|
||||||
}
|
}
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
value = string(bvalue)
|
value = string(bvalue)
|
||||||
}
|
}
|
||||||
|
@ -494,6 +494,9 @@ func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch,
|
|||||||
return newIter
|
return newIter
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A function that takes a predicate and returns two IBioSequenceBatch iterators.
|
||||||
|
// Sequences extracted from the input iterator are distributed among both the
|
||||||
|
// iterator following the predicate value.
|
||||||
func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate,
|
func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate,
|
||||||
size int, sizes ...int) (IBioSequenceBatch, IBioSequenceBatch) {
|
size int, sizes ...int) (IBioSequenceBatch, IBioSequenceBatch) {
|
||||||
buffsize := iterator.BufferSize()
|
buffsize := iterator.BufferSize()
|
||||||
@ -560,6 +563,8 @@ func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate,
|
|||||||
return trueIter, falseIter
|
return trueIter, falseIter
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Filtering a batch of sequences.
|
||||||
|
// A function that takes a predicate and a batch of sequences and returns a filtered batch of sequences.
|
||||||
func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate,
|
func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate,
|
||||||
size int, sizes ...int) IBioSequenceBatch {
|
size int, sizes ...int) IBioSequenceBatch {
|
||||||
buffsize := iterator.BufferSize()
|
buffsize := iterator.BufferSize()
|
||||||
@ -615,6 +620,8 @@ func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate,
|
|||||||
return trueIter.Rebatch(size)
|
return trueIter.Rebatch(size)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Load every sequences availables from an IBioSequenceBatch iterator into
|
||||||
|
// a large obiseq.BioSequenceSlice.
|
||||||
func (iterator IBioSequenceBatch) Load() obiseq.BioSequenceSlice {
|
func (iterator IBioSequenceBatch) Load() obiseq.BioSequenceSlice {
|
||||||
|
|
||||||
chunck := obiseq.MakeBioSequenceSlice()
|
chunck := obiseq.MakeBioSequenceSlice()
|
||||||
@ -625,4 +632,42 @@ func (iterator IBioSequenceBatch) Load() obiseq.BioSequenceSlice {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return chunck
|
return chunck
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// It takes a slice of BioSequence objects, and returns an iterator that will return batches of
|
||||||
|
// BioSequence objects
|
||||||
|
func IBatchOver(data obiseq.BioSequenceSlice,
|
||||||
|
size int, sizes ...int) IBioSequenceBatch {
|
||||||
|
|
||||||
|
buffsize := 0
|
||||||
|
|
||||||
|
if len(sizes) > 0 {
|
||||||
|
buffsize = sizes[1]
|
||||||
|
}
|
||||||
|
|
||||||
|
trueIter := MakeIBioSequenceBatch(buffsize)
|
||||||
|
|
||||||
|
trueIter.Add(1)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
trueIter.WaitAndClose()
|
||||||
|
}()
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
ldata := len(data)
|
||||||
|
batchid := 0
|
||||||
|
next := 0
|
||||||
|
for i:=0; i < ldata; i=next {
|
||||||
|
next = i + size
|
||||||
|
if next > ldata {
|
||||||
|
next = ldata
|
||||||
|
}
|
||||||
|
trueIter.Push(MakeBioSequenceBatch(batchid,data[i:next]))
|
||||||
|
batchid++
|
||||||
|
}
|
||||||
|
|
||||||
|
trueIter.Done()
|
||||||
|
}()
|
||||||
|
|
||||||
|
return trueIter
|
||||||
|
}
|
||||||
|
@ -43,6 +43,9 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
|
|||||||
case StatsOnValues:
|
case StatsOnValues:
|
||||||
stats = istat
|
stats = istat
|
||||||
newstat = false
|
newstat = false
|
||||||
|
case map[string]int:
|
||||||
|
stats = istat
|
||||||
|
newstat = false
|
||||||
case map[string]interface{}:
|
case map[string]interface{}:
|
||||||
stats = make(StatsOnValues, len(istat))
|
stats = make(StatsOnValues, len(istat))
|
||||||
newstat = false
|
newstat = false
|
||||||
|
27
pkg/obistats/minmax.go
Normal file
27
pkg/obistats/minmax.go
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
package obistats
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
func Max[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T {
|
||||||
|
m := data[0]
|
||||||
|
|
||||||
|
for _,v := range data {
|
||||||
|
if v > m {
|
||||||
|
m = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return m
|
||||||
|
}
|
||||||
|
|
||||||
|
func Min[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T {
|
||||||
|
m := data[0]
|
||||||
|
|
||||||
|
for _,v := range data {
|
||||||
|
if v < m {
|
||||||
|
m = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return m
|
||||||
|
}
|
49
pkg/obistats/utils.go
Normal file
49
pkg/obistats/utils.go
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
package obistats
|
||||||
|
|
||||||
|
import (
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
"math"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Lchoose returns logarithms of binomial coefficients
|
||||||
|
func Lchoose(n,x int) float64 {
|
||||||
|
fn := float64(n)
|
||||||
|
fx := float64(x)
|
||||||
|
|
||||||
|
ln1, _ := math.Lgamma(fn + 1.0)
|
||||||
|
lx1, _ := math.Lgamma(fx + 1.0)
|
||||||
|
lnx1, _ := math.Lgamma(fn - fx + 1.0)
|
||||||
|
|
||||||
|
return ln1 - lx1 - lnx1
|
||||||
|
}
|
||||||
|
|
||||||
|
func Choose(n,x int) float64 {
|
||||||
|
return math.Exp(Lchoose(x,n))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
func LogAddExp(x, y float64) float64 {
|
||||||
|
|
||||||
|
tmp := x - y
|
||||||
|
|
||||||
|
if tmp > 0 {
|
||||||
|
|
||||||
|
return x + math.Log1p(math.Exp(-tmp))
|
||||||
|
|
||||||
|
} else if tmp <= 0 {
|
||||||
|
|
||||||
|
return y + math.Log1p(math.Exp(tmp))
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// Nans, or infinities of the same sign involved
|
||||||
|
|
||||||
|
log.Errorf("logaddexp %f %f", x, y)
|
||||||
|
|
||||||
|
return x + y
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
448
pkg/obitools/obiclean/graph.go
Normal file
448
pkg/obitools/obiclean/graph.go
Normal file
@ -0,0 +1,448 @@
|
|||||||
|
package obiclean
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bytes"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"path"
|
||||||
|
"sort"
|
||||||
|
"sync"
|
||||||
|
"text/template"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
|
||||||
|
"github.com/schollz/progressbar/v3"
|
||||||
|
)
|
||||||
|
|
||||||
|
type Ratio struct {
|
||||||
|
From int
|
||||||
|
To int
|
||||||
|
Pos int
|
||||||
|
Length int
|
||||||
|
}
|
||||||
|
|
||||||
|
func abs(x int) int {
|
||||||
|
if x < 0 {
|
||||||
|
return -x
|
||||||
|
}
|
||||||
|
return x
|
||||||
|
}
|
||||||
|
|
||||||
|
func max(x, y int) int {
|
||||||
|
if x > y {
|
||||||
|
return x
|
||||||
|
}
|
||||||
|
return y
|
||||||
|
}
|
||||||
|
|
||||||
|
func min(x, y int) int {
|
||||||
|
if x < y {
|
||||||
|
return x
|
||||||
|
}
|
||||||
|
return y
|
||||||
|
}
|
||||||
|
|
||||||
|
func minMax(x, y int) (int, int) {
|
||||||
|
if x < y {
|
||||||
|
return x, y
|
||||||
|
}
|
||||||
|
return y, x
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// It takes a filename and a 2D slice of floats pruduced during graph building,
|
||||||
|
// and writes a CSV file with the first column being the
|
||||||
|
// first nucleotide, the second column being the second nucleotide, and the third column being the
|
||||||
|
// ratio
|
||||||
|
func EmpiricalDistCsv(filename string, data [][]Ratio) {
|
||||||
|
file, err := os.Create(filename)
|
||||||
|
if err != nil {
|
||||||
|
fmt.Println(err)
|
||||||
|
}
|
||||||
|
defer file.Close()
|
||||||
|
|
||||||
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
|
pbopt = append(pbopt,
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Save CSV stat ratio file"),
|
||||||
|
)
|
||||||
|
|
||||||
|
bar := progressbar.NewOptions(len(data), pbopt...)
|
||||||
|
|
||||||
|
fmt.Fprintln(file, "From,To,Count_from,Count_to,Position,length")
|
||||||
|
for code, dist := range data {
|
||||||
|
a1, a2 := intToNucPair(code)
|
||||||
|
for _, ratio := range dist {
|
||||||
|
fmt.Fprintf(file, "%c,%c,%d,%d,%d,%d\n", a1, a2, ratio.From, ratio.To, ratio.Pos, ratio.Length)
|
||||||
|
}
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// It takes a slice of sequences, a sample name and a statistical threshold and returns a string
|
||||||
|
// containing a GML representation of the graph
|
||||||
|
func Gml(seqs *[]*seqPCR, sample string, statThreshold int) string {
|
||||||
|
// (*seqs)[1].Count
|
||||||
|
var dot bytes.Buffer
|
||||||
|
digraphTpl := template.New("gml_digraph")
|
||||||
|
digraph := `graph [
|
||||||
|
comment "Obiclean graph for sample {{ Name }}"
|
||||||
|
directed 1
|
||||||
|
{{range $index, $data:= .}}
|
||||||
|
{{ if or $data.Fathers $data.HasSon}}
|
||||||
|
node [ id {{$index}}
|
||||||
|
graphics [
|
||||||
|
type "{{ Shape $data.Count }}"
|
||||||
|
fill "{{ if and $data.HasSon (not $data.Fathers)}}#0000FF{{ else }}#00FF00{{ end }}"
|
||||||
|
h {{ Sqrt $data.Count }}
|
||||||
|
w {{ Sqrt $data.Count }}
|
||||||
|
]
|
||||||
|
|
||||||
|
]
|
||||||
|
{{ end }}
|
||||||
|
{{ end }}
|
||||||
|
|
||||||
|
{{range $index, $data:= .}}
|
||||||
|
{{range $i, $father:= $data.Fathers}}
|
||||||
|
edge [ source {{$index}}
|
||||||
|
target {{$father}}
|
||||||
|
color "{{ if gt (index $data.Dist $i) 1 }}#FF0000{{ else }}#00FF00{{ end }}"
|
||||||
|
label "{{(index $data.Dist $i)}}"
|
||||||
|
]
|
||||||
|
{{ end }}
|
||||||
|
{{ end }}
|
||||||
|
]
|
||||||
|
|
||||||
|
`
|
||||||
|
|
||||||
|
tmpl, err := digraphTpl.Funcs(template.FuncMap{
|
||||||
|
"Sqrt": func(i int) int { return 3 * int(math.Floor(math.Sqrt(float64(i)))) },
|
||||||
|
"Name": func() string { return sample },
|
||||||
|
"Shape": func(i int) string {
|
||||||
|
if i >= statThreshold {
|
||||||
|
return "circle"
|
||||||
|
} else {
|
||||||
|
return "rectangle"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}).Parse(digraph)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
err = tmpl.Execute(&dot, *seqs)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
panic(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return dot.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaveGMLGraphs(dirname string,
|
||||||
|
samples map[string]*[]*seqPCR,
|
||||||
|
statThreshold int,
|
||||||
|
) {
|
||||||
|
|
||||||
|
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
|
||||||
|
// path does not exist or is not directory
|
||||||
|
os.RemoveAll(dirname)
|
||||||
|
err := os.Mkdir(dirname, 0755)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Panicf("Cannot create directory %s for saving graphs", dirname)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
|
pbopt = append(pbopt,
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Save GML Graph files]"),
|
||||||
|
)
|
||||||
|
|
||||||
|
bar := progressbar.NewOptions(len(samples), pbopt...)
|
||||||
|
|
||||||
|
for name, seqs := range samples {
|
||||||
|
|
||||||
|
file, err := os.Create(path.Join(dirname,
|
||||||
|
fmt.Sprintf("%s.gml", name)))
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
fmt.Println(err)
|
||||||
|
}
|
||||||
|
|
||||||
|
file.WriteString(Gml(seqs, name, statThreshold))
|
||||||
|
file.Close()
|
||||||
|
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func nucPair(a, b byte) int {
|
||||||
|
|
||||||
|
n1 := 0
|
||||||
|
switch a {
|
||||||
|
case 'a':
|
||||||
|
n1 = 1
|
||||||
|
case 'c':
|
||||||
|
n1 = 2
|
||||||
|
case 'g':
|
||||||
|
n1 = 3
|
||||||
|
case 't':
|
||||||
|
n1 = 4
|
||||||
|
}
|
||||||
|
|
||||||
|
n2 := 0
|
||||||
|
switch b {
|
||||||
|
case 'a':
|
||||||
|
n2 = 1
|
||||||
|
case 'c':
|
||||||
|
n2 = 2
|
||||||
|
case 'g':
|
||||||
|
n2 = 3
|
||||||
|
case 't':
|
||||||
|
n2 = 4
|
||||||
|
}
|
||||||
|
|
||||||
|
return n1*5 + n2
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func intToNucPair(code int) (a, b byte) {
|
||||||
|
var decode = []byte{'-', 'a', 'c', 'g', 't'}
|
||||||
|
c1 := code / 5
|
||||||
|
c2 := code - c1*5
|
||||||
|
|
||||||
|
return decode[c1], decode[c2]
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildSamplePairs(seqs *[]*seqPCR, minStatRatio int, workers int) ([][]Ratio, int) {
|
||||||
|
nseq := len(*seqs)
|
||||||
|
running := sync.WaitGroup{}
|
||||||
|
|
||||||
|
linePairs := func(i int) [][]Ratio {
|
||||||
|
|
||||||
|
ratio := make([][]Ratio, 25)
|
||||||
|
son := (*seqs)[i]
|
||||||
|
|
||||||
|
for j := i + 1; j < nseq; j++ {
|
||||||
|
father := (*seqs)[j]
|
||||||
|
d, pos, a1, a2 := obialign.D1Or0(son.Sequence, father.Sequence)
|
||||||
|
if d > 0 {
|
||||||
|
son.Fathers = append(son.Fathers, j)
|
||||||
|
son.Dist = append(son.Dist, d)
|
||||||
|
father.SonCount++
|
||||||
|
|
||||||
|
if father.Count > minStatRatio {
|
||||||
|
n := nucPair(a1, a2)
|
||||||
|
ratio[n] = append(ratio[n], Ratio{father.Count, son.Count, pos, father.Sequence.Length()})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ratio
|
||||||
|
}
|
||||||
|
|
||||||
|
lineChan := make(chan int)
|
||||||
|
idxChan := make(chan [][]Ratio)
|
||||||
|
|
||||||
|
ff := func() {
|
||||||
|
for i := range lineChan {
|
||||||
|
idxChan <- linePairs(i)
|
||||||
|
}
|
||||||
|
|
||||||
|
running.Done()
|
||||||
|
}
|
||||||
|
|
||||||
|
running.Add(workers)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
running.Wait()
|
||||||
|
close(idxChan)
|
||||||
|
}()
|
||||||
|
|
||||||
|
for i := 0; i < workers; i++ {
|
||||||
|
go ff()
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
for i := 0; i < nseq; i++ {
|
||||||
|
lineChan <- i
|
||||||
|
}
|
||||||
|
close(lineChan)
|
||||||
|
}()
|
||||||
|
|
||||||
|
np := nseq * (nseq - 1) / 2
|
||||||
|
ratio := make([][]Ratio, 25)
|
||||||
|
for data := range idxChan {
|
||||||
|
for i, r := range data {
|
||||||
|
ratio[i] = append(ratio[i], r...)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return ratio, np
|
||||||
|
}
|
||||||
|
|
||||||
|
func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int {
|
||||||
|
nseq := len(*seqs)
|
||||||
|
running := sync.WaitGroup{}
|
||||||
|
|
||||||
|
linePairs := func(matrix *obialign.LCSMatrix, i int) {
|
||||||
|
son := (*seqs)[i]
|
||||||
|
for j := i + 1; j < nseq; j++ {
|
||||||
|
father := (*seqs)[j]
|
||||||
|
d, _, _, _ := obialign.D1Or0(son.Sequence, father.Sequence)
|
||||||
|
|
||||||
|
if d < 0 {
|
||||||
|
lcs, lali := obialign.LCSScore(son.Sequence, father.Sequence,
|
||||||
|
step,
|
||||||
|
matrix)
|
||||||
|
d := (lali - lcs)
|
||||||
|
if lcs >= 0 && d <= step && step > 0 {
|
||||||
|
son.Fathers = append(son.Fathers, j)
|
||||||
|
son.Dist = append(son.Dist, d)
|
||||||
|
father.SonCount++
|
||||||
|
//a, b := minMax((*seqs)[i].Count, (*seqs)[j].Count)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
lineChan := make(chan int)
|
||||||
|
// idxChan := make(chan [][]Ratio)
|
||||||
|
|
||||||
|
ff := func() {
|
||||||
|
matrix := obialign.NewLCSMatrix(nil, 150, 150, step)
|
||||||
|
for i := range lineChan {
|
||||||
|
linePairs(matrix, i)
|
||||||
|
}
|
||||||
|
|
||||||
|
running.Done()
|
||||||
|
}
|
||||||
|
|
||||||
|
running.Add(workers)
|
||||||
|
|
||||||
|
for i := 0; i < workers; i++ {
|
||||||
|
go ff()
|
||||||
|
}
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
for i := 0; i < nseq; i++ {
|
||||||
|
if len((*seqs)[i].Fathers) == 0 {
|
||||||
|
lineChan <- i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(lineChan)
|
||||||
|
}()
|
||||||
|
|
||||||
|
running.Wait()
|
||||||
|
np := nseq * (nseq - 1) / 2
|
||||||
|
return np
|
||||||
|
}
|
||||||
|
|
||||||
|
func FilterGraphOnRatio(seqs *[]*seqPCR, ratio float64) {
|
||||||
|
for _, s1 := range *seqs {
|
||||||
|
c1 := float64(s1.Count)
|
||||||
|
f := s1.Fathers
|
||||||
|
d := s1.Dist
|
||||||
|
j := 0
|
||||||
|
for i, s2 := range f {
|
||||||
|
f[j] = f[i]
|
||||||
|
d[j] = d[i]
|
||||||
|
if (c1 / float64((*seqs)[s2].Count)) <= math.Pow(ratio, float64(d[i])) {
|
||||||
|
j++
|
||||||
|
} else {
|
||||||
|
(*seqs)[s2].SonCount--
|
||||||
|
}
|
||||||
|
}
|
||||||
|
s1.Fathers = f[0:j]
|
||||||
|
s1.Dist = d[0:j]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sortSamples sorts the sequences in each sample by their increasing count
|
||||||
|
func sortSamples(samples map[string]*([]*seqPCR)) {
|
||||||
|
|
||||||
|
for _, s := range samples {
|
||||||
|
sort.SliceStable(*s, func(i, j int) bool {
|
||||||
|
return (*s)[i].Count < (*s)[j].Count
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func ObicleanStatus(seq *seqPCR) string {
|
||||||
|
if len(seq.Fathers) == 0 {
|
||||||
|
if seq.SonCount > 0 {
|
||||||
|
return "h"
|
||||||
|
} else {
|
||||||
|
return "s"
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return "i"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func BuildSeqGraph(samples map[string]*[]*seqPCR,
|
||||||
|
maxError, minStatRatio, workers int) [][]Ratio {
|
||||||
|
|
||||||
|
sortSamples(samples)
|
||||||
|
|
||||||
|
npairs := 0
|
||||||
|
for _, seqs := range samples {
|
||||||
|
nseq := len(*seqs)
|
||||||
|
npairs += nseq * (nseq - 1) / 2
|
||||||
|
}
|
||||||
|
|
||||||
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
|
pbopt = append(pbopt,
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[One error graph]"),
|
||||||
|
)
|
||||||
|
|
||||||
|
bar := progressbar.NewOptions(npairs, pbopt...)
|
||||||
|
all_ratio := make([][]Ratio, 25)
|
||||||
|
for _, seqs := range samples {
|
||||||
|
ratio, np := buildSamplePairs(seqs, minStatRatio, workers)
|
||||||
|
|
||||||
|
for i, r := range ratio {
|
||||||
|
all_ratio[i] = append(all_ratio[i], r...)
|
||||||
|
}
|
||||||
|
|
||||||
|
bar.Add(np)
|
||||||
|
}
|
||||||
|
|
||||||
|
if maxError > 1 {
|
||||||
|
pbopt = make([]progressbar.Option, 0, 5)
|
||||||
|
pbopt = append(pbopt,
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Adds multiple errors]"),
|
||||||
|
)
|
||||||
|
|
||||||
|
bar = progressbar.NewOptions(npairs, pbopt...)
|
||||||
|
|
||||||
|
for _, seqs := range samples {
|
||||||
|
np := extendSimilarityGraph(seqs, maxError, workers)
|
||||||
|
bar.Add(np)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return all_ratio
|
||||||
|
}
|
251
pkg/obitools/obiclean/obiclean.go
Normal file
251
pkg/obitools/obiclean/obiclean.go
Normal file
@ -0,0 +1,251 @@
|
|||||||
|
package obiclean
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
"github.com/schollz/progressbar/v3"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
type seqPCR struct {
|
||||||
|
Count int // number of reads associated to a sequence in a PCR
|
||||||
|
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
|
||||||
|
SonCount int
|
||||||
|
Fathers []int
|
||||||
|
Dist []int
|
||||||
|
}
|
||||||
|
|
||||||
|
// buildSamples sorts the sequences by samples
|
||||||
|
//
|
||||||
|
// The sequences are distributed according to their sample association.
|
||||||
|
// The function returns a map indexed by sample names.
|
||||||
|
// Each sample is represented by a vector where each element associates
|
||||||
|
// a sequence to its count of occurrences in the considered sample.
|
||||||
|
func buildSamples(dataset obiseq.BioSequenceSlice,
|
||||||
|
tag, NAValue string) map[string]*([]*seqPCR) {
|
||||||
|
samples := make(map[string]*([]*seqPCR))
|
||||||
|
|
||||||
|
for _, s := range dataset {
|
||||||
|
stats := s.StatsOn(tag, NAValue)
|
||||||
|
|
||||||
|
for k, v := range stats {
|
||||||
|
pcr, ok := samples[k]
|
||||||
|
|
||||||
|
if !ok {
|
||||||
|
p := make([]*seqPCR, 0, 10)
|
||||||
|
pcr = &p
|
||||||
|
samples[k] = pcr
|
||||||
|
}
|
||||||
|
|
||||||
|
*pcr = append(*pcr, &seqPCR{
|
||||||
|
Count: v,
|
||||||
|
Sequence: s,
|
||||||
|
SonCount: 0,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return samples
|
||||||
|
}
|
||||||
|
|
||||||
|
func annotateOBIClean(dataset obiseq.BioSequenceSlice,
|
||||||
|
sample map[string]*([]*seqPCR),
|
||||||
|
tag, NAValue string) obiiter.IBioSequenceBatch {
|
||||||
|
batchsize := 1000
|
||||||
|
var annot = func(data obiseq.BioSequenceSlice) obiseq.BioSequenceSlice {
|
||||||
|
|
||||||
|
for _, s := range data {
|
||||||
|
status := Status(s)
|
||||||
|
head := 0
|
||||||
|
internal := 0
|
||||||
|
singleton := 0
|
||||||
|
for _, v := range status {
|
||||||
|
switch v {
|
||||||
|
case "i":
|
||||||
|
internal++
|
||||||
|
case "h":
|
||||||
|
head++
|
||||||
|
case "s":
|
||||||
|
singleton++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
is_head := (head + singleton) > 0
|
||||||
|
|
||||||
|
annotation := s.Annotations()
|
||||||
|
annotation["obiclean_head"] = is_head
|
||||||
|
annotation["obiclean_singletoncount"] = singleton
|
||||||
|
annotation["obiclean_internalcount"] = internal
|
||||||
|
annotation["obiclean_headcount"] = head
|
||||||
|
annotation["obiclean_samplecount"] = head + internal + singleton
|
||||||
|
|
||||||
|
}
|
||||||
|
return data
|
||||||
|
}
|
||||||
|
|
||||||
|
iter := obiiter.IBatchOver(dataset, batchsize)
|
||||||
|
riter := iter.MakeISliceWorker(annot)
|
||||||
|
|
||||||
|
return riter
|
||||||
|
}
|
||||||
|
|
||||||
|
func IsHead(sequence *obiseq.BioSequence) bool {
|
||||||
|
annotation := sequence.Annotations()
|
||||||
|
iishead, ok := annotation["obiclean_head"]
|
||||||
|
ishead := true
|
||||||
|
|
||||||
|
if ok {
|
||||||
|
switch iishead := iishead.(type) {
|
||||||
|
case bool:
|
||||||
|
ishead = iishead
|
||||||
|
default:
|
||||||
|
log.Panic("obiclean_head attribute of sequence %s must be a boolean not : %v", sequence.Id(), iishead)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ishead
|
||||||
|
}
|
||||||
|
|
||||||
|
func HeadCount(sequence *obiseq.BioSequence) int {
|
||||||
|
var err error
|
||||||
|
annotation := sequence.Annotations()
|
||||||
|
ivalue, ok := annotation["obiclean_headcount"]
|
||||||
|
value := 0
|
||||||
|
|
||||||
|
if ok {
|
||||||
|
value, err = goutils.InterfaceToInt(value)
|
||||||
|
if err != nil {
|
||||||
|
log.Panic("obiclean_headcount attribute of sequence %s must be an integer value not : %v", sequence.Id(), ivalue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
func InternalCount(sequence *obiseq.BioSequence) int {
|
||||||
|
var err error
|
||||||
|
annotation := sequence.Annotations()
|
||||||
|
ivalue, ok := annotation["obiclean_internalcount"]
|
||||||
|
value := 0
|
||||||
|
|
||||||
|
if ok {
|
||||||
|
value, err = goutils.InterfaceToInt(value)
|
||||||
|
if err != nil {
|
||||||
|
log.Panic("obiclean_internalcount attribute of sequence %s must be an integer value not : %v", sequence.Id(), ivalue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
func SingletonCount(sequence *obiseq.BioSequence) int {
|
||||||
|
var err error
|
||||||
|
annotation := sequence.Annotations()
|
||||||
|
ivalue, ok := annotation["obiclean_samplecount"]
|
||||||
|
value := 0
|
||||||
|
|
||||||
|
if ok {
|
||||||
|
value, err = goutils.InterfaceToInt(value)
|
||||||
|
if err != nil {
|
||||||
|
log.Panic("obiclean_samplecount attribute of sequence %s must be an integer value not : %v", sequence.Id(), ivalue)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return value
|
||||||
|
}
|
||||||
|
|
||||||
|
func Status(sequence *obiseq.BioSequence) map[string]string {
|
||||||
|
annotation := sequence.Annotations()
|
||||||
|
iobistatus, ok := annotation["obiclean_status"]
|
||||||
|
var obistatus map[string]string
|
||||||
|
|
||||||
|
if ok {
|
||||||
|
switch iobistatus := iobistatus.(type) {
|
||||||
|
case map[string]string:
|
||||||
|
obistatus = iobistatus
|
||||||
|
case map[string]interface{}:
|
||||||
|
obistatus = make(map[string]string)
|
||||||
|
for k, v := range iobistatus {
|
||||||
|
obistatus[k] = fmt.Sprint(v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
obistatus = make(map[string]string)
|
||||||
|
annotation["obiclean_status"] = obistatus
|
||||||
|
}
|
||||||
|
|
||||||
|
return obistatus
|
||||||
|
}
|
||||||
|
|
||||||
|
func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
||||||
|
|
||||||
|
db := itertator.Load()
|
||||||
|
|
||||||
|
log.Infof("Sequence dataset of %d sequeences loaded\n", len(db))
|
||||||
|
|
||||||
|
samples := buildSamples(db, SampleAttribute(), "NA")
|
||||||
|
|
||||||
|
log.Infof("Dataset composed of %d samples\n", len(samples))
|
||||||
|
|
||||||
|
all_ratio := BuildSeqGraph(samples,
|
||||||
|
DistStepMax(),
|
||||||
|
MinCountToEvalMutationRate(),
|
||||||
|
obioptions.CLIParallelWorkers())
|
||||||
|
|
||||||
|
if RatioMax() < 1.0 {
|
||||||
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
|
pbopt = append(pbopt,
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Filter graph on abundance ratio]"),
|
||||||
|
)
|
||||||
|
|
||||||
|
bar := progressbar.NewOptions(len(samples), pbopt...)
|
||||||
|
|
||||||
|
for _, seqs := range samples {
|
||||||
|
FilterGraphOnRatio(seqs, RatioMax())
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if IsSaveRatioTable() {
|
||||||
|
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
|
||||||
|
}
|
||||||
|
|
||||||
|
if SaveGraphToFiles() {
|
||||||
|
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
|
||||||
|
}
|
||||||
|
|
||||||
|
pbopt := make([]progressbar.Option, 0, 5)
|
||||||
|
pbopt = append(pbopt,
|
||||||
|
progressbar.OptionSetWriter(os.Stderr),
|
||||||
|
progressbar.OptionSetWidth(15),
|
||||||
|
progressbar.OptionShowIts(),
|
||||||
|
progressbar.OptionSetPredictTime(true),
|
||||||
|
progressbar.OptionSetDescription("[Annotate sequence status]"),
|
||||||
|
)
|
||||||
|
|
||||||
|
bar := progressbar.NewOptions(len(samples), pbopt...)
|
||||||
|
|
||||||
|
for name, seqs := range samples {
|
||||||
|
for _, pcr := range *seqs {
|
||||||
|
obistatus := Status(pcr.Sequence)
|
||||||
|
obistatus[name] = ObicleanStatus(pcr)
|
||||||
|
}
|
||||||
|
bar.Add(1)
|
||||||
|
}
|
||||||
|
|
||||||
|
iter := annotateOBIClean(db, samples, SampleAttribute(), "NA")
|
||||||
|
|
||||||
|
if OnlyHead() {
|
||||||
|
iter = iter.FilterOn(IsHead, 1000)
|
||||||
|
}
|
||||||
|
|
||||||
|
return iter
|
||||||
|
}
|
113
pkg/obitools/obiclean/options.go
Normal file
113
pkg/obitools/obiclean/options.go
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
package obiclean
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
|
"github.com/DavidGamba/go-getoptions"
|
||||||
|
)
|
||||||
|
|
||||||
|
var _distStepMax = 1
|
||||||
|
var _sampleAttribute = "sample"
|
||||||
|
|
||||||
|
var _ratioMax = 1.0
|
||||||
|
var _minEvalRate = 1000
|
||||||
|
|
||||||
|
var _clusterMode = false
|
||||||
|
var _onlyHead = false
|
||||||
|
|
||||||
|
var _saveGraph = "__@@NOSAVE@@__"
|
||||||
|
var _saveRatio = "__@@NOSAVE@@__"
|
||||||
|
|
||||||
|
func ObicleanOptionSet(options *getoptions.GetOpt) {
|
||||||
|
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
|
||||||
|
options.Alias("s"),
|
||||||
|
options.Description("Attribute containing sample descriptions (default %s)."))
|
||||||
|
|
||||||
|
options.IntVar(&_distStepMax, "distance", _distStepMax,
|
||||||
|
options.Alias("d"),
|
||||||
|
options.Description("Maximum numbers of differences between two variant sequences (default: %d)."))
|
||||||
|
|
||||||
|
options.Float64Var(&_ratioMax, "ratio", _ratioMax,
|
||||||
|
options.Alias("r"),
|
||||||
|
options.Description("Threshold ratio between counts (rare/abundant counts)"+
|
||||||
|
" of two sequence records so that the less abundant one is a variant of "+
|
||||||
|
"the more abundant (default: %3.2f)."))
|
||||||
|
|
||||||
|
options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate,
|
||||||
|
options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate."))
|
||||||
|
|
||||||
|
// options.BoolVar(&_clusterMode, "cluster", _clusterMode,
|
||||||
|
// options.Alias("C"),
|
||||||
|
// options.Description("Switch obiclean into its clustering mode. This adds information to each sequence about the true."),
|
||||||
|
// )
|
||||||
|
|
||||||
|
options.BoolVar(&_onlyHead, "head", _onlyHead,
|
||||||
|
options.Alias("H"),
|
||||||
|
options.Description("Select only sequences with the head status in a least one sample."),
|
||||||
|
)
|
||||||
|
|
||||||
|
options.StringVar(&_saveGraph, "save-graph", _saveGraph,
|
||||||
|
options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
|
||||||
|
"The graph files follow the graphml format."),
|
||||||
|
)
|
||||||
|
|
||||||
|
options.StringVar(&_saveRatio, "save-ratio", _saveRatio,
|
||||||
|
options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+
|
||||||
|
"The ratio file follows the csv format."),
|
||||||
|
)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
|
obiconvert.InputOptionSet(options)
|
||||||
|
obiconvert.OutputOptionSet(options)
|
||||||
|
ObicleanOptionSet(options)
|
||||||
|
}
|
||||||
|
|
||||||
|
func DistStepMax() int {
|
||||||
|
return _distStepMax
|
||||||
|
}
|
||||||
|
|
||||||
|
// It returns the name of the attibute used to store sample name
|
||||||
|
func SampleAttribute() string {
|
||||||
|
return _sampleAttribute
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the maximum abundance ratio between read counts to consider sequences as error.
|
||||||
|
func RatioMax() float64 {
|
||||||
|
return _ratioMax
|
||||||
|
}
|
||||||
|
|
||||||
|
// > The function `MinCountToEvalMutationRate()` returns the minimum number of reads that must be
|
||||||
|
// observed before the mutation rate can be evaluated
|
||||||
|
func MinCountToEvalMutationRate() int {
|
||||||
|
return _minEvalRate
|
||||||
|
}
|
||||||
|
|
||||||
|
func ClusterMode() bool {
|
||||||
|
return _clusterMode
|
||||||
|
}
|
||||||
|
|
||||||
|
// `OnlyHead()` returns a boolean value that indicates whether the `-h` flag was passed to the program
|
||||||
|
func OnlyHead() bool {
|
||||||
|
return _onlyHead
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true it the obliclean graphs must be saved
|
||||||
|
func SaveGraphToFiles() bool {
|
||||||
|
return _saveGraph != "__@@NOSAVE@@__"
|
||||||
|
}
|
||||||
|
|
||||||
|
// It returns the directory where the graph files are saved
|
||||||
|
func GraphFilesDirectory() string {
|
||||||
|
return _saveGraph
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns true it the table of ratio must be saved
|
||||||
|
func IsSaveRatioTable() bool {
|
||||||
|
return _saveRatio != "__@@NOSAVE@@__"
|
||||||
|
}
|
||||||
|
|
||||||
|
// It returns the filename of the file that stores the ratio table
|
||||||
|
func RatioTableFilename() string {
|
||||||
|
return _saveRatio
|
||||||
|
}
|
@ -60,7 +60,7 @@ func OptionSet(options *getoptions.GetOpt) {
|
|||||||
// --forward command line option
|
// --forward command line option
|
||||||
func ForwardPrimer() string {
|
func ForwardPrimer() string {
|
||||||
pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch)
|
pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("%+v", err)
|
log.Fatalf("%+v", err)
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user