A first functional version of obiclean

This commit is contained in:
2022-08-20 18:01:07 +02:00
parent a07d348aea
commit 5dd835d3e7
16 changed files with 1091 additions and 8 deletions

View File

@ -0,0 +1,22 @@
package main
import (
"os"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiclean"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
)
func main() {
optionParser := obioptions.GenerateOptionParser(obiclean.OptionSet)
_, args, _ := optionParser(os.Args)
fs, _ := obiconvert.ReadBioSequencesBatch(args...)
cleaned := obiclean.IOBIClean(fs)
obiconvert.WriteBioSequencesBatch(cleaned, true)
}

View File

@ -12,6 +12,7 @@ import (
func main() {
// Creating a file called cpu.trace.
ftrace, err := os.Create("cpu.trace")
if err != nil {
log.Fatal(err)

7
go.mod
View File

@ -1,6 +1,6 @@
module git.metabarcoding.org/lecasofts/go/obitools
go 1.17
go 1.18
require (
github.com/DavidGamba/go-getoptions v0.25.3
@ -12,10 +12,15 @@ require (
)
require (
github.com/akualab/gjoa v0.0.0-20150423185904-0953495dbcc7 // indirect
github.com/golang/glog v1.0.0 // indirect
github.com/mattn/go-runewidth v0.0.13 // indirect
github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db // indirect
github.com/rivo/uniseg v0.2.0 // indirect
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 // indirect
golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3 // indirect
golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a // indirect
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect
gonum.org/v1/gonum v0.11.0 // indirect
scientificgo.org/special v0.0.0 // indirect
)

30
go.sum
View File

@ -1,14 +1,21 @@
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/DavidGamba/go-getoptions v0.25.3 h1:lSPcMkwWvVZU05C+Uz4DKnKN5wz4bcD1QvJ/QHCRexo=
github.com/DavidGamba/go-getoptions v0.25.3/go.mod h1:qLaLSYeQ8sUVOfKuu5JT5qKKS3OCwyhkYSJnoG+ggmo=
github.com/PaesslerAG/gval v1.1.2 h1:EROKxV4/fAKWb0Qoj7NOxmHZA7gcpjOV9XgiRZMRCUU=
github.com/PaesslerAG/gval v1.1.2/go.mod h1:Fa8gfkCmUsELXgayr8sfL/sw+VzCVoa03dcOcR/if2w=
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
github.com/akualab/gjoa v0.0.0-20150423185904-0953495dbcc7 h1:v8r5rCWpphm5f+yaKDQvVKM46jj0SB7iDUTVS1MHSAg=
github.com/akualab/gjoa v0.0.0-20150423185904-0953495dbcc7/go.mod h1:zhxCkmvf40lxgYc6aJuF6AOwfzuvFIANF9OQaMz5xNs=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU=
github.com/goccy/go-json v0.9.4 h1:L8MLKG2mvVXiQu07qB6hmfqeSYQdOnqPot2GhsIwIaI=
github.com/goccy/go-json v0.9.4/go.mod h1:6MelG93GURQebXPDq3khkgXZkazVtN9CRI+MGFi0w8I=
github.com/golang/glog v1.0.0 h1:nfP3RFugxnNRyKgeWd4oI1nYvXpxrx8ck8ZrcizshdQ=
github.com/golang/glog v1.0.0/go.mod h1:EWib/APOK0SL3dFbYqvxE3UYd8E6s1ouQ7iEp/0LWV4=
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw=
github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94=
github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU=
@ -29,9 +36,24 @@ github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/tevino/abool/v2 v2.0.1 h1:OF7FC5V5z3yAWyixbc32ecEzrgAJCsPkVOsPM2qoZPI=
github.com/tevino/abool/v2 v2.0.1/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838 h1:71vQrMauZZhcTVK6KdYM+rklehEEwb3E+ZhaE5jrPrE=
golang.org/x/crypto v0.0.0-20220131195533-30dcbda58838/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3 h1:n9HxLrNxWWtEb1cA950nuEEj3QnKbtsCJ6KjcgisNUs=
golang.org/x/exp v0.0.0-20191002040644-a1355ae1e2c3/go.mod h1:NOZ3BPKG0ec/BKJQgnvsSFpcKLM5xXVWnvZS97DWHgE=
golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0=
golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o=
golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@ -43,5 +65,13 @@ golang.org/x/sys v0.0.0-20220204135822-1c1b9b1eba6a/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190927191325-030b2cf1153e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E=
gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA=
scientificgo.org/special v0.0.0 h1:P6WJkECo6tgtvZAEfNXl+KEB9ReAatjKAeX8U07mjSc=
scientificgo.org/special v0.0.0/go.mod h1:LoGVh9tS431RLTJo7gFlYDKFWq44cEb7QqL+M0EKtZU=
scientificgo.org/testutil v0.0.0/go.mod h1:Go6R4b+9YkFocMo3H3vNQ7tjbrX9Rc12wal7NZjvPXg=

View File

@ -57,6 +57,7 @@ func InterfaceToInt(i interface{}) (val int, err error) {
return
}
// If the interface{} can be cast to an int, return true.
func CastableToInt(i interface{}) bool {
switch i.(type) {
@ -70,7 +71,10 @@ func CastableToInt(i interface{}) bool {
}
}
// CopyMap makes a deep copy of a map[string]interface{}.
// "CopyMap copies the contents of a map[string]interface{} to another map[string]interface{}."
//
// The function uses the gob package to encode the source map into a buffer and then decode the buffer
// into the destination map
func CopyMap(dest, src map[string]interface{}) {
buf := new(bytes.Buffer)
gob.NewEncoder(buf).Encode(src)
@ -78,6 +82,7 @@ func CopyMap(dest, src map[string]interface{}) {
}
// Read a whole file into the memory and store it as array of lines
// It reads a file line by line, and returns a slice of strings, one for each line
func ReadLines(path string) (lines []string, err error) {
var (
file *os.File

View File

@ -0,0 +1,78 @@
package obialign
import "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
func abs(x int) int {
if x < 0 {
return -x
}
return x
}
func D1Or0(seq1, seq2 *obiseq.BioSequence) (int, int, byte, byte) {
pos := -1
l1 := seq1.Length()
l2 := seq2.Length()
if abs(l1-l2) > 1 {
return -1, pos, 0, 0
}
s1 := seq1.Sequence()
s2 := seq2.Sequence()
b1 := 0
b2 := 0
// Scans the sequences from their beginings as long as they are identical
for b1 < l1 && b2 < l2 && s1[b1] == s2[b2] {
b1++
b2++
}
if b1 == l1 && b2 == l2 {
return 0, pos, 0, 0
}
// Scans the sequences from their ends as long as they are identical
e1 := l1 - 1
e2 := l2 - 1
for (e1 > b1 || e2 > b2) && s1[e1] == s2[e2] {
e1--
e2--
}
if (l1 == l2 && (e1 > b1 || e2 > b2)) ||
(l1 > l2 && e1 > b1) ||
(l1 < l2 && e2 > b2) {
return -1, pos, 0, 0
}
if b1 >= e1 {
if e1 > e2 {
pos = e1
} else {
pos = e2
}
}
a1 := byte('-')
a2 := byte('-')
if e2 >= e1 {
a2 = s2[e2]
}
if e2 <= e1 {
a1 = s1[e1]
}
return 1, pos, a1, a2
}

View File

@ -138,6 +138,7 @@ func OptionMaxLength(maxLength int) WithOption {
// error allowed when matching the forward
// primer.
func OptionForwardPrimer(primer string, max int) WithOption {
f := WithOption(func(opt Options) {
var err error

View File

@ -210,17 +210,22 @@ func ParseOBIFeatures(text string, annotations obiseq.Annotation) string {
bvalue = bytes.TrimSpace(part[m[0]:(m[1] - 1)])
j := bytes.ReplaceAll(bvalue, []byte("'"), []byte(`"`))
var err error
if strings.HasPrefix(key, "merged_") ||
strings.HasSuffix(key, "_count") {
switch {
case strings.HasPrefix(key, "merged_") ||
strings.HasSuffix(key, "_count"):
dict := make(map[string]int)
err = json.Unmarshal(j, &dict)
value = dict
} else {
case strings.HasSuffix(key, "_status"):
dict := make(map[string]string)
err = json.Unmarshal(j, &dict)
value = dict
default:
dict := make(map[string]interface{})
err = json.Unmarshal(j, &dict)
value = dict
}
if err != nil {
value = string(bvalue)
}

View File

@ -494,6 +494,9 @@ func (iterator IBioSequenceBatch) PairWith(reverse IBioSequenceBatch,
return newIter
}
// A function that takes a predicate and returns two IBioSequenceBatch iterators.
// Sequences extracted from the input iterator are distributed among both the
// iterator following the predicate value.
func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate,
size int, sizes ...int) (IBioSequenceBatch, IBioSequenceBatch) {
buffsize := iterator.BufferSize()
@ -560,6 +563,8 @@ func (iterator IBioSequenceBatch) DivideOn(predicate obiseq.SequencePredicate,
return trueIter, falseIter
}
// Filtering a batch of sequences.
// A function that takes a predicate and a batch of sequences and returns a filtered batch of sequences.
func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate,
size int, sizes ...int) IBioSequenceBatch {
buffsize := iterator.BufferSize()
@ -615,6 +620,8 @@ func (iterator IBioSequenceBatch) FilterOn(predicate obiseq.SequencePredicate,
return trueIter.Rebatch(size)
}
// Load every sequences availables from an IBioSequenceBatch iterator into
// a large obiseq.BioSequenceSlice.
func (iterator IBioSequenceBatch) Load() obiseq.BioSequenceSlice {
chunck := obiseq.MakeBioSequenceSlice()
@ -625,4 +632,42 @@ func (iterator IBioSequenceBatch) Load() obiseq.BioSequenceSlice {
}
return chunck
}
}
// It takes a slice of BioSequence objects, and returns an iterator that will return batches of
// BioSequence objects
func IBatchOver(data obiseq.BioSequenceSlice,
size int, sizes ...int) IBioSequenceBatch {
buffsize := 0
if len(sizes) > 0 {
buffsize = sizes[1]
}
trueIter := MakeIBioSequenceBatch(buffsize)
trueIter.Add(1)
go func() {
trueIter.WaitAndClose()
}()
go func() {
ldata := len(data)
batchid := 0
next := 0
for i:=0; i < ldata; i=next {
next = i + size
if next > ldata {
next = ldata
}
trueIter.Push(MakeBioSequenceBatch(batchid,data[i:next]))
batchid++
}
trueIter.Done()
}()
return trueIter
}

View File

@ -43,6 +43,9 @@ func (sequence *BioSequence) StatsOn(key string, na string) StatsOnValues {
case StatsOnValues:
stats = istat
newstat = false
case map[string]int:
stats = istat
newstat = false
case map[string]interface{}:
stats = make(StatsOnValues, len(istat))
newstat = false

27
pkg/obistats/minmax.go Normal file
View File

@ -0,0 +1,27 @@
package obistats
func Max[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T {
m := data[0]
for _,v := range data {
if v > m {
m = v
}
}
return m
}
func Min[T int64 | int32 | int16 | int8 | int | float32 | float64] (data []T) T {
m := data[0]
for _,v := range data {
if v < m {
m = v
}
}
return m
}

49
pkg/obistats/utils.go Normal file
View File

@ -0,0 +1,49 @@
package obistats
import (
log "github.com/sirupsen/logrus"
"math"
)
// Lchoose returns logarithms of binomial coefficients
func Lchoose(n,x int) float64 {
fn := float64(n)
fx := float64(x)
ln1, _ := math.Lgamma(fn + 1.0)
lx1, _ := math.Lgamma(fx + 1.0)
lnx1, _ := math.Lgamma(fn - fx + 1.0)
return ln1 - lx1 - lnx1
}
func Choose(n,x int) float64 {
return math.Exp(Lchoose(x,n))
}
func LogAddExp(x, y float64) float64 {
tmp := x - y
if tmp > 0 {
return x + math.Log1p(math.Exp(-tmp))
} else if tmp <= 0 {
return y + math.Log1p(math.Exp(tmp))
} else {
// Nans, or infinities of the same sign involved
log.Errorf("logaddexp %f %f", x, y)
return x + y
}
}

View File

@ -0,0 +1,448 @@
package obiclean
import (
"bytes"
"fmt"
"log"
"math"
"os"
"path"
"sort"
"sync"
"text/template"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"github.com/schollz/progressbar/v3"
)
type Ratio struct {
From int
To int
Pos int
Length int
}
func abs(x int) int {
if x < 0 {
return -x
}
return x
}
func max(x, y int) int {
if x > y {
return x
}
return y
}
func min(x, y int) int {
if x < y {
return x
}
return y
}
func minMax(x, y int) (int, int) {
if x < y {
return x, y
}
return y, x
}
// It takes a filename and a 2D slice of floats pruduced during graph building,
// and writes a CSV file with the first column being the
// first nucleotide, the second column being the second nucleotide, and the third column being the
// ratio
func EmpiricalDistCsv(filename string, data [][]Ratio) {
file, err := os.Create(filename)
if err != nil {
fmt.Println(err)
}
defer file.Close()
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowIts(),
progressbar.OptionSetPredictTime(true),
progressbar.OptionSetDescription("[Save CSV stat ratio file"),
)
bar := progressbar.NewOptions(len(data), pbopt...)
fmt.Fprintln(file, "From,To,Count_from,Count_to,Position,length")
for code, dist := range data {
a1, a2 := intToNucPair(code)
for _, ratio := range dist {
fmt.Fprintf(file, "%c,%c,%d,%d,%d,%d\n", a1, a2, ratio.From, ratio.To, ratio.Pos, ratio.Length)
}
bar.Add(1)
}
}
// It takes a slice of sequences, a sample name and a statistical threshold and returns a string
// containing a GML representation of the graph
func Gml(seqs *[]*seqPCR, sample string, statThreshold int) string {
// (*seqs)[1].Count
var dot bytes.Buffer
digraphTpl := template.New("gml_digraph")
digraph := `graph [
comment "Obiclean graph for sample {{ Name }}"
directed 1
{{range $index, $data:= .}}
{{ if or $data.Fathers $data.HasSon}}
node [ id {{$index}}
graphics [
type "{{ Shape $data.Count }}"
fill "{{ if and $data.HasSon (not $data.Fathers)}}#0000FF{{ else }}#00FF00{{ end }}"
h {{ Sqrt $data.Count }}
w {{ Sqrt $data.Count }}
]
]
{{ end }}
{{ end }}
{{range $index, $data:= .}}
{{range $i, $father:= $data.Fathers}}
edge [ source {{$index}}
target {{$father}}
color "{{ if gt (index $data.Dist $i) 1 }}#FF0000{{ else }}#00FF00{{ end }}"
label "{{(index $data.Dist $i)}}"
]
{{ end }}
{{ end }}
]
`
tmpl, err := digraphTpl.Funcs(template.FuncMap{
"Sqrt": func(i int) int { return 3 * int(math.Floor(math.Sqrt(float64(i)))) },
"Name": func() string { return sample },
"Shape": func(i int) string {
if i >= statThreshold {
return "circle"
} else {
return "rectangle"
}
},
}).Parse(digraph)
if err != nil {
panic(err)
}
err = tmpl.Execute(&dot, *seqs)
if err != nil {
panic(err)
}
return dot.String()
}
func SaveGMLGraphs(dirname string,
samples map[string]*[]*seqPCR,
statThreshold int,
) {
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
// path does not exist or is not directory
os.RemoveAll(dirname)
err := os.Mkdir(dirname, 0755)
if err != nil {
log.Panicf("Cannot create directory %s for saving graphs", dirname)
}
}
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowIts(),
progressbar.OptionSetPredictTime(true),
progressbar.OptionSetDescription("[Save GML Graph files]"),
)
bar := progressbar.NewOptions(len(samples), pbopt...)
for name, seqs := range samples {
file, err := os.Create(path.Join(dirname,
fmt.Sprintf("%s.gml", name)))
if err != nil {
fmt.Println(err)
}
file.WriteString(Gml(seqs, name, statThreshold))
file.Close()
bar.Add(1)
}
}
func nucPair(a, b byte) int {
n1 := 0
switch a {
case 'a':
n1 = 1
case 'c':
n1 = 2
case 'g':
n1 = 3
case 't':
n1 = 4
}
n2 := 0
switch b {
case 'a':
n2 = 1
case 'c':
n2 = 2
case 'g':
n2 = 3
case 't':
n2 = 4
}
return n1*5 + n2
}
func intToNucPair(code int) (a, b byte) {
var decode = []byte{'-', 'a', 'c', 'g', 't'}
c1 := code / 5
c2 := code - c1*5
return decode[c1], decode[c2]
}
func buildSamplePairs(seqs *[]*seqPCR, minStatRatio int, workers int) ([][]Ratio, int) {
nseq := len(*seqs)
running := sync.WaitGroup{}
linePairs := func(i int) [][]Ratio {
ratio := make([][]Ratio, 25)
son := (*seqs)[i]
for j := i + 1; j < nseq; j++ {
father := (*seqs)[j]
d, pos, a1, a2 := obialign.D1Or0(son.Sequence, father.Sequence)
if d > 0 {
son.Fathers = append(son.Fathers, j)
son.Dist = append(son.Dist, d)
father.SonCount++
if father.Count > minStatRatio {
n := nucPair(a1, a2)
ratio[n] = append(ratio[n], Ratio{father.Count, son.Count, pos, father.Sequence.Length()})
}
}
}
return ratio
}
lineChan := make(chan int)
idxChan := make(chan [][]Ratio)
ff := func() {
for i := range lineChan {
idxChan <- linePairs(i)
}
running.Done()
}
running.Add(workers)
go func() {
running.Wait()
close(idxChan)
}()
for i := 0; i < workers; i++ {
go ff()
}
go func() {
for i := 0; i < nseq; i++ {
lineChan <- i
}
close(lineChan)
}()
np := nseq * (nseq - 1) / 2
ratio := make([][]Ratio, 25)
for data := range idxChan {
for i, r := range data {
ratio[i] = append(ratio[i], r...)
}
}
return ratio, np
}
func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int {
nseq := len(*seqs)
running := sync.WaitGroup{}
linePairs := func(matrix *obialign.LCSMatrix, i int) {
son := (*seqs)[i]
for j := i + 1; j < nseq; j++ {
father := (*seqs)[j]
d, _, _, _ := obialign.D1Or0(son.Sequence, father.Sequence)
if d < 0 {
lcs, lali := obialign.LCSScore(son.Sequence, father.Sequence,
step,
matrix)
d := (lali - lcs)
if lcs >= 0 && d <= step && step > 0 {
son.Fathers = append(son.Fathers, j)
son.Dist = append(son.Dist, d)
father.SonCount++
//a, b := minMax((*seqs)[i].Count, (*seqs)[j].Count)
}
}
}
}
lineChan := make(chan int)
// idxChan := make(chan [][]Ratio)
ff := func() {
matrix := obialign.NewLCSMatrix(nil, 150, 150, step)
for i := range lineChan {
linePairs(matrix, i)
}
running.Done()
}
running.Add(workers)
for i := 0; i < workers; i++ {
go ff()
}
go func() {
for i := 0; i < nseq; i++ {
if len((*seqs)[i].Fathers) == 0 {
lineChan <- i
}
}
close(lineChan)
}()
running.Wait()
np := nseq * (nseq - 1) / 2
return np
}
func FilterGraphOnRatio(seqs *[]*seqPCR, ratio float64) {
for _, s1 := range *seqs {
c1 := float64(s1.Count)
f := s1.Fathers
d := s1.Dist
j := 0
for i, s2 := range f {
f[j] = f[i]
d[j] = d[i]
if (c1 / float64((*seqs)[s2].Count)) <= math.Pow(ratio, float64(d[i])) {
j++
} else {
(*seqs)[s2].SonCount--
}
}
s1.Fathers = f[0:j]
s1.Dist = d[0:j]
}
}
// sortSamples sorts the sequences in each sample by their increasing count
func sortSamples(samples map[string]*([]*seqPCR)) {
for _, s := range samples {
sort.SliceStable(*s, func(i, j int) bool {
return (*s)[i].Count < (*s)[j].Count
})
}
}
func ObicleanStatus(seq *seqPCR) string {
if len(seq.Fathers) == 0 {
if seq.SonCount > 0 {
return "h"
} else {
return "s"
}
} else {
return "i"
}
}
func BuildSeqGraph(samples map[string]*[]*seqPCR,
maxError, minStatRatio, workers int) [][]Ratio {
sortSamples(samples)
npairs := 0
for _, seqs := range samples {
nseq := len(*seqs)
npairs += nseq * (nseq - 1) / 2
}
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowIts(),
progressbar.OptionSetPredictTime(true),
progressbar.OptionSetDescription("[One error graph]"),
)
bar := progressbar.NewOptions(npairs, pbopt...)
all_ratio := make([][]Ratio, 25)
for _, seqs := range samples {
ratio, np := buildSamplePairs(seqs, minStatRatio, workers)
for i, r := range ratio {
all_ratio[i] = append(all_ratio[i], r...)
}
bar.Add(np)
}
if maxError > 1 {
pbopt = make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowIts(),
progressbar.OptionSetPredictTime(true),
progressbar.OptionSetDescription("[Adds multiple errors]"),
)
bar = progressbar.NewOptions(npairs, pbopt...)
for _, seqs := range samples {
np := extendSimilarityGraph(seqs, maxError, workers)
bar.Add(np)
}
}
return all_ratio
}

View File

@ -0,0 +1,251 @@
package obiclean
import (
"fmt"
"os"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"github.com/schollz/progressbar/v3"
log "github.com/sirupsen/logrus"
)
type seqPCR struct {
Count int // number of reads associated to a sequence in a PCR
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
SonCount int
Fathers []int
Dist []int
}
// buildSamples sorts the sequences by samples
//
// The sequences are distributed according to their sample association.
// The function returns a map indexed by sample names.
// Each sample is represented by a vector where each element associates
// a sequence to its count of occurrences in the considered sample.
func buildSamples(dataset obiseq.BioSequenceSlice,
tag, NAValue string) map[string]*([]*seqPCR) {
samples := make(map[string]*([]*seqPCR))
for _, s := range dataset {
stats := s.StatsOn(tag, NAValue)
for k, v := range stats {
pcr, ok := samples[k]
if !ok {
p := make([]*seqPCR, 0, 10)
pcr = &p
samples[k] = pcr
}
*pcr = append(*pcr, &seqPCR{
Count: v,
Sequence: s,
SonCount: 0,
})
}
}
return samples
}
func annotateOBIClean(dataset obiseq.BioSequenceSlice,
sample map[string]*([]*seqPCR),
tag, NAValue string) obiiter.IBioSequenceBatch {
batchsize := 1000
var annot = func(data obiseq.BioSequenceSlice) obiseq.BioSequenceSlice {
for _, s := range data {
status := Status(s)
head := 0
internal := 0
singleton := 0
for _, v := range status {
switch v {
case "i":
internal++
case "h":
head++
case "s":
singleton++
}
}
is_head := (head + singleton) > 0
annotation := s.Annotations()
annotation["obiclean_head"] = is_head
annotation["obiclean_singletoncount"] = singleton
annotation["obiclean_internalcount"] = internal
annotation["obiclean_headcount"] = head
annotation["obiclean_samplecount"] = head + internal + singleton
}
return data
}
iter := obiiter.IBatchOver(dataset, batchsize)
riter := iter.MakeISliceWorker(annot)
return riter
}
func IsHead(sequence *obiseq.BioSequence) bool {
annotation := sequence.Annotations()
iishead, ok := annotation["obiclean_head"]
ishead := true
if ok {
switch iishead := iishead.(type) {
case bool:
ishead = iishead
default:
log.Panic("obiclean_head attribute of sequence %s must be a boolean not : %v", sequence.Id(), iishead)
}
}
return ishead
}
func HeadCount(sequence *obiseq.BioSequence) int {
var err error
annotation := sequence.Annotations()
ivalue, ok := annotation["obiclean_headcount"]
value := 0
if ok {
value, err = goutils.InterfaceToInt(value)
if err != nil {
log.Panic("obiclean_headcount attribute of sequence %s must be an integer value not : %v", sequence.Id(), ivalue)
}
}
return value
}
func InternalCount(sequence *obiseq.BioSequence) int {
var err error
annotation := sequence.Annotations()
ivalue, ok := annotation["obiclean_internalcount"]
value := 0
if ok {
value, err = goutils.InterfaceToInt(value)
if err != nil {
log.Panic("obiclean_internalcount attribute of sequence %s must be an integer value not : %v", sequence.Id(), ivalue)
}
}
return value
}
func SingletonCount(sequence *obiseq.BioSequence) int {
var err error
annotation := sequence.Annotations()
ivalue, ok := annotation["obiclean_samplecount"]
value := 0
if ok {
value, err = goutils.InterfaceToInt(value)
if err != nil {
log.Panic("obiclean_samplecount attribute of sequence %s must be an integer value not : %v", sequence.Id(), ivalue)
}
}
return value
}
func Status(sequence *obiseq.BioSequence) map[string]string {
annotation := sequence.Annotations()
iobistatus, ok := annotation["obiclean_status"]
var obistatus map[string]string
if ok {
switch iobistatus := iobistatus.(type) {
case map[string]string:
obistatus = iobistatus
case map[string]interface{}:
obistatus = make(map[string]string)
for k, v := range iobistatus {
obistatus[k] = fmt.Sprint(v)
}
}
} else {
obistatus = make(map[string]string)
annotation["obiclean_status"] = obistatus
}
return obistatus
}
func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
db := itertator.Load()
log.Infof("Sequence dataset of %d sequeences loaded\n", len(db))
samples := buildSamples(db, SampleAttribute(), "NA")
log.Infof("Dataset composed of %d samples\n", len(samples))
all_ratio := BuildSeqGraph(samples,
DistStepMax(),
MinCountToEvalMutationRate(),
obioptions.CLIParallelWorkers())
if RatioMax() < 1.0 {
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowIts(),
progressbar.OptionSetPredictTime(true),
progressbar.OptionSetDescription("[Filter graph on abundance ratio]"),
)
bar := progressbar.NewOptions(len(samples), pbopt...)
for _, seqs := range samples {
FilterGraphOnRatio(seqs, RatioMax())
bar.Add(1)
}
}
if IsSaveRatioTable() {
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
}
if SaveGraphToFiles() {
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
}
pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt,
progressbar.OptionSetWriter(os.Stderr),
progressbar.OptionSetWidth(15),
progressbar.OptionShowIts(),
progressbar.OptionSetPredictTime(true),
progressbar.OptionSetDescription("[Annotate sequence status]"),
)
bar := progressbar.NewOptions(len(samples), pbopt...)
for name, seqs := range samples {
for _, pcr := range *seqs {
obistatus := Status(pcr.Sequence)
obistatus[name] = ObicleanStatus(pcr)
}
bar.Add(1)
}
iter := annotateOBIClean(db, samples, SampleAttribute(), "NA")
if OnlyHead() {
iter = iter.FilterOn(IsHead, 1000)
}
return iter
}

View File

@ -0,0 +1,113 @@
package obiclean
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
var _distStepMax = 1
var _sampleAttribute = "sample"
var _ratioMax = 1.0
var _minEvalRate = 1000
var _clusterMode = false
var _onlyHead = false
var _saveGraph = "__@@NOSAVE@@__"
var _saveRatio = "__@@NOSAVE@@__"
func ObicleanOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
options.Alias("s"),
options.Description("Attribute containing sample descriptions (default %s)."))
options.IntVar(&_distStepMax, "distance", _distStepMax,
options.Alias("d"),
options.Description("Maximum numbers of differences between two variant sequences (default: %d)."))
options.Float64Var(&_ratioMax, "ratio", _ratioMax,
options.Alias("r"),
options.Description("Threshold ratio between counts (rare/abundant counts)"+
" of two sequence records so that the less abundant one is a variant of "+
"the more abundant (default: %3.2f)."))
options.IntVar(&_minEvalRate, "min-eval-rate", _minEvalRate,
options.Description("Minimum abundance of a sequence to be used to evaluate mutation rate."))
// options.BoolVar(&_clusterMode, "cluster", _clusterMode,
// options.Alias("C"),
// options.Description("Switch obiclean into its clustering mode. This adds information to each sequence about the true."),
// )
options.BoolVar(&_onlyHead, "head", _onlyHead,
options.Alias("H"),
options.Description("Select only sequences with the head status in a least one sample."),
)
options.StringVar(&_saveGraph, "save-graph", _saveGraph,
options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
"The graph files follow the graphml format."),
)
options.StringVar(&_saveRatio, "save-ratio", _saveRatio,
options.Description("Creates a file containing the set of abundance ratio on the graph edge. "+
"The ratio file follows the csv format."),
)
}
func OptionSet(options *getoptions.GetOpt) {
obiconvert.InputOptionSet(options)
obiconvert.OutputOptionSet(options)
ObicleanOptionSet(options)
}
func DistStepMax() int {
return _distStepMax
}
// It returns the name of the attibute used to store sample name
func SampleAttribute() string {
return _sampleAttribute
}
// Return the maximum abundance ratio between read counts to consider sequences as error.
func RatioMax() float64 {
return _ratioMax
}
// > The function `MinCountToEvalMutationRate()` returns the minimum number of reads that must be
// observed before the mutation rate can be evaluated
func MinCountToEvalMutationRate() int {
return _minEvalRate
}
func ClusterMode() bool {
return _clusterMode
}
// `OnlyHead()` returns a boolean value that indicates whether the `-h` flag was passed to the program
func OnlyHead() bool {
return _onlyHead
}
// Returns true it the obliclean graphs must be saved
func SaveGraphToFiles() bool {
return _saveGraph != "__@@NOSAVE@@__"
}
// It returns the directory where the graph files are saved
func GraphFilesDirectory() string {
return _saveGraph
}
// Returns true it the table of ratio must be saved
func IsSaveRatioTable() bool {
return _saveRatio != "__@@NOSAVE@@__"
}
// It returns the filename of the file that stores the ratio table
func RatioTableFilename() string {
return _saveRatio
}

View File

@ -60,7 +60,7 @@ func OptionSet(options *getoptions.GetOpt) {
// --forward command line option
func ForwardPrimer() string {
pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch)
if err != nil {
log.Fatalf("%+v", err)
}