mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-25 21:40:52 +00:00
This commit introduces a new configuration module `obidefault` to manage progress bar settings, allowing users to disable progress bars via a `--no-progressbar` option. It updates various packages to conditionally display progress bars based on this new configuration, improving user experience by providing control over progress bar output. The changes also include improvements to progress bar handling in several packages, ensuring they are only displayed when appropriate (e.g., when stderr is a terminal and stdout is not piped).
548 lines
11 KiB
Go
548 lines
11 KiB
Go
package obiclean
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"path"
|
|
"sort"
|
|
"sync"
|
|
"text/template"
|
|
|
|
log "github.com/sirupsen/logrus"
|
|
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
|
"github.com/schollz/progressbar/v3"
|
|
)
|
|
|
|
// Ratio is one observation of a distance-1 relationship between an original
// (father) sequence and one of its mutants inside a sample. EstimateRatio
// fills these records and EmpiricalDistCsv writes one CSV row per record.
type Ratio struct {
	// Sample is the name of the sample the pair was observed in.
	Sample string
	// SeqID is the identifier of the original (father) sequence.
	SeqID string
	// OriginalStatus is the status recorded for the original sequence in
	// this sample.
	OriginalStatus string
	// WOriginal and WMutant are the weights of the original and of the mutant.
	WOriginal, WMutant int
	// COriginal and CMutant are the counts of the original and of the mutant.
	COriginal, CMutant int
	// Pos is the position stored on the edge linking the two sequences and
	// Length is the length of the original sequence.
	Pos, Length int
	// A, C, G and T are the numbers of 'a', 'c', 'g' and 't' bytes found in
	// the original sequence.
	A, C, G, T int
}
|
|
|
|
// Edge links a sequence (the son, which owns the edge) to one of its
// fathers in the per-sample similarity graph.
type Edge struct {
	// Father is the index of the father in the sample's sequence slice.
	Father int
	// From and To are the two symbols involved in the difference; edges
	// added for multi-error relationships carry '-' for both.
	// NOTE(review): which side is the father and which the son depends on
	// the return order of obialign.D1Or0 — confirm there.
	From, To byte
	// Pos is the position of the difference (-1 on multi-error edges),
	// NucPair is the code computed by nucPair(From, To), and Dist is the
	// distance between the son and the father.
	Pos, NucPair, Dist int
}
|
|
|
|
func makeEdge(father, dist, pos int, from, to byte) Edge {
|
|
return Edge{
|
|
Father: father,
|
|
Dist: dist,
|
|
Pos: pos,
|
|
From: from,
|
|
To: to,
|
|
NucPair: nucPair(from, to),
|
|
}
|
|
}
|
|
|
|
// It takes a filename and a 2D slice of floats pruduced during graph building,
|
|
// and writes a CSV file with the first column being the
|
|
// first nucleotide, the second column being the second nucleotide, and the third column being the
|
|
// ratio
|
|
func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
|
file, err := os.Create(filename)
|
|
if err != nil {
|
|
fmt.Println(err)
|
|
}
|
|
|
|
destfile, err := obiutils.CompressStream(file, true, true)
|
|
if err != nil {
|
|
fmt.Println(err)
|
|
}
|
|
defer destfile.Close()
|
|
|
|
var bar *progressbar.ProgressBar
|
|
if obidefault.ProgressBar() {
|
|
pbopt := make([]progressbar.Option, 0, 5)
|
|
pbopt = append(pbopt,
|
|
progressbar.OptionSetWriter(os.Stderr),
|
|
progressbar.OptionSetWidth(15),
|
|
progressbar.OptionShowIts(),
|
|
progressbar.OptionSetPredictTime(true),
|
|
progressbar.OptionSetDescription("[Save CSV stat ratio file]"),
|
|
)
|
|
bar = progressbar.NewOptions(len(data), pbopt...)
|
|
}
|
|
|
|
fmt.Fprintln(destfile, "Sample,Origin_id,Origin_status,Origin,Mutant,Origin_Weight,Mutant_Weight,Origin_Count,Mutant_Count,Position,Origin_length,A,C,G,T")
|
|
for code, dist := range data {
|
|
a1, a2 := intToNucPair(code)
|
|
for _, ratio := range dist {
|
|
fmt.Fprintf(destfile, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
|
|
ratio.Sample,
|
|
ratio.SeqID,
|
|
ratio.OriginalStatus,
|
|
a1, a2,
|
|
ratio.WOriginal,
|
|
ratio.WMutant,
|
|
ratio.COriginal,
|
|
ratio.CMutant,
|
|
ratio.Pos,
|
|
ratio.Length,
|
|
ratio.A,
|
|
ratio.C,
|
|
ratio.G,
|
|
ratio.T,
|
|
)
|
|
}
|
|
if bar != nil {
|
|
bar.Add(1)
|
|
}
|
|
}
|
|
}
|
|
|
|
// It takes a slice of sequences, a sample name and a statistical threshold and returns a string
|
|
// containing a GML representation of the graph
|
|
func Gml(seqs *[]*seqPCR, sample string, statThreshold int) string {
|
|
// (*seqs)[1].Count
|
|
var dot bytes.Buffer
|
|
digraphTpl := template.New("gml_digraph")
|
|
digraph := `graph [
|
|
comment "Obiclean graph for sample {{ Name }}"
|
|
directed 1
|
|
{{range $index, $data:= .}}
|
|
{{ if or $data.Edges (gt $data.SonCount 0)}}
|
|
node [ id {{$index}}
|
|
graphics [
|
|
type "{{ Shape $data.Count }}"
|
|
fill "{{ if and (gt $data.SonCount 0) (not $data.Edges)}}#0000FF{{ else }}#00FF00{{ end }}"
|
|
h {{ Sqrt $data.Count }}
|
|
w {{ Sqrt $data.Count }}
|
|
]
|
|
weight {{$data.Count}}
|
|
]
|
|
{{ end }}
|
|
{{ end }}
|
|
|
|
{{range $index, $data:= .}}
|
|
{{range $i, $edge:= $data.Edges}}
|
|
edge [ source {{$index}}
|
|
target {{$edge.Father}}
|
|
color "{{ if gt (index $data.Edges $i).Dist 1 }}#FF0000{{ else }}#00FF00{{ end }}"
|
|
label "{{(index $data.Edges $i).Dist}}"
|
|
]
|
|
{{ end }}
|
|
{{ end }}
|
|
]
|
|
|
|
`
|
|
|
|
tmpl, err := digraphTpl.Funcs(template.FuncMap{
|
|
"Sqrt": func(i int) int { return 3 * int(math.Floor(math.Sqrt(float64(i)))) },
|
|
"Name": func() string { return sample },
|
|
"Shape": func(i int) string {
|
|
if i >= statThreshold {
|
|
return "circle"
|
|
} else {
|
|
return "rectangle"
|
|
}
|
|
},
|
|
}).Parse(digraph)
|
|
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
err = tmpl.Execute(&dot, *seqs)
|
|
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
|
|
return dot.String()
|
|
}
|
|
|
|
func SaveGMLGraphs(dirname string,
|
|
samples map[string]*[]*seqPCR,
|
|
statThreshold int,
|
|
) {
|
|
|
|
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
|
|
// path does not exist or is not directory
|
|
os.RemoveAll(dirname)
|
|
err := os.Mkdir(dirname, 0755)
|
|
|
|
if err != nil {
|
|
log.Panicf("Cannot create directory %s for saving graphs", dirname)
|
|
}
|
|
}
|
|
|
|
var bar *progressbar.ProgressBar
|
|
if obidefault.ProgressBar() {
|
|
pbopt := make([]progressbar.Option, 0, 5)
|
|
pbopt = append(pbopt,
|
|
progressbar.OptionSetWriter(os.Stderr),
|
|
progressbar.OptionSetWidth(15),
|
|
progressbar.OptionShowIts(),
|
|
progressbar.OptionSetPredictTime(true),
|
|
progressbar.OptionSetDescription("[Save GML Graph files]"),
|
|
)
|
|
bar = progressbar.NewOptions(len(samples), pbopt...)
|
|
}
|
|
|
|
for name, seqs := range samples {
|
|
|
|
file, err := os.Create(path.Join(dirname,
|
|
fmt.Sprintf("%s.gml", name)))
|
|
|
|
if err != nil {
|
|
fmt.Println(err)
|
|
}
|
|
|
|
file.WriteString(Gml(seqs, name, statThreshold))
|
|
file.Close()
|
|
|
|
if bar != nil {
|
|
bar.Add(1)
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
// nucPair encodes an ordered pair of symbols as a single integer in
// [0, 24]. Each symbol is ranked 'a'=1, 'c'=2, 'g'=3, 't'=4 and anything
// else (including '-') 0; the code is 5*rank(a) + rank(b).
// intToNucPair performs the reverse decoding.
func nucPair(a, b byte) int {
	rank := func(nuc byte) int {
		switch nuc {
		case 'a':
			return 1
		case 'c':
			return 2
		case 'g':
			return 3
		case 't':
			return 4
		}
		return 0
	}

	return 5*rank(a) + rank(b)
}
|
|
|
|
// intToNucPair decodes a pair code into its two symbols: the quotient of
// code by 5 selects the first symbol and the remainder the second, both
// looked up in the alphabet '-', 'a', 'c', 'g', 't'.
// It panics when code lies outside [0, 24].
func intToNucPair(code int) (a, b byte) {
	const symbols = "-acgt"

	return symbols[code/5], symbols[code%5]
}
|
|
|
|
func reweightSequences(seqs *[]*seqPCR) {
|
|
|
|
for _, node := range *seqs {
|
|
node.Weight = node.Count
|
|
}
|
|
|
|
//var rfunc func(*seqPCR)
|
|
|
|
rfunc := func(node *seqPCR) {
|
|
node.AddedSons = 0
|
|
nedges := len(node.Edges)
|
|
if nedges > 0 {
|
|
swf := 0.0
|
|
|
|
for k := 0; k < nedges; k++ {
|
|
swf += float64((*seqs)[node.Edges[k].Father].Count)
|
|
}
|
|
|
|
for k := 0; k < nedges; k++ {
|
|
father := (*seqs)[node.Edges[k].Father]
|
|
father.Weight += int(math.Round(float64(node.Weight) * float64(father.Count) / swf))
|
|
father.AddedSons++
|
|
// log.Println(father.AddedSons, father.SonCount)
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
for _, node := range *seqs {
|
|
if node.SonCount == 0 {
|
|
rfunc(node)
|
|
}
|
|
}
|
|
|
|
for done := true; done; {
|
|
done = false
|
|
for _, node := range *seqs {
|
|
if node.SonCount > 0 && node.SonCount == node.AddedSons {
|
|
// log.Println(node.AddedSons, node.SonCount)
|
|
rfunc(node)
|
|
done = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func buildSamplePairs(seqs *[]*seqPCR, workers int) int {
|
|
nseq := len(*seqs)
|
|
running := sync.WaitGroup{}
|
|
|
|
linePairs := func(i int) {
|
|
|
|
son := (*seqs)[i]
|
|
|
|
for j := i + 1; j < nseq; j++ {
|
|
father := (*seqs)[j]
|
|
if father.Count > son.Count {
|
|
d, pos, a1, a2 := obialign.D1Or0(son.Sequence, father.Sequence)
|
|
if d > 0 {
|
|
son.Edges = append(son.Edges, makeEdge(j, d, pos, a2, a1))
|
|
father.SonCount++
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
lineChan := make(chan int)
|
|
|
|
ff := func() {
|
|
for i := range lineChan {
|
|
linePairs(i)
|
|
}
|
|
running.Done()
|
|
}
|
|
|
|
running.Add(workers)
|
|
|
|
for i := 0; i < workers; i++ {
|
|
go ff()
|
|
}
|
|
|
|
go func() {
|
|
for i := 0; i < nseq; i++ {
|
|
lineChan <- i
|
|
}
|
|
close(lineChan)
|
|
}()
|
|
|
|
np := nseq * (nseq - 1) / 2
|
|
|
|
running.Wait()
|
|
|
|
reweightSequences(seqs)
|
|
|
|
return np
|
|
}
|
|
|
|
func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int {
|
|
nseq := len(*seqs)
|
|
running := sync.WaitGroup{}
|
|
|
|
linePairs := func(matrix *[]uint64, i int) {
|
|
son := (*seqs)[i]
|
|
for j := i + 1; j < nseq; j++ {
|
|
father := (*seqs)[j]
|
|
if father.Count > son.Count {
|
|
d, _, _, _ := obialign.D1Or0(son.Sequence, father.Sequence)
|
|
|
|
if d < 0 {
|
|
lcs, lali := obialign.FastLCSScore(son.Sequence, father.Sequence,
|
|
step,
|
|
matrix)
|
|
d := (lali - lcs)
|
|
if lcs >= 0 && d <= step && step > 0 {
|
|
son.Edges = append(son.Edges, makeEdge(j, d, -1, '-', '-'))
|
|
father.SonCount++
|
|
//a, b := minMax((*seqs)[i].Count, (*seqs)[j].Count)
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
lineChan := make(chan int)
|
|
// idxChan := make(chan [][]Ratio)
|
|
|
|
ff := func() {
|
|
var matrix []uint64
|
|
|
|
for i := range lineChan {
|
|
linePairs(&matrix, i)
|
|
}
|
|
|
|
running.Done()
|
|
}
|
|
|
|
running.Add(workers)
|
|
|
|
for i := 0; i < workers; i++ {
|
|
go ff()
|
|
}
|
|
|
|
go func() {
|
|
for i := 0; i < nseq; i++ {
|
|
if len((*seqs)[i].Edges) == 0 {
|
|
lineChan <- i
|
|
}
|
|
}
|
|
close(lineChan)
|
|
}()
|
|
|
|
running.Wait()
|
|
np := nseq * (nseq - 1) / 2
|
|
return np
|
|
}
|
|
|
|
func FilterGraphOnRatio(seqs *[]*seqPCR, ratio float64) {
|
|
for _, s1 := range *seqs {
|
|
c1 := float64(s1.Weight)
|
|
e := s1.Edges
|
|
j := 0
|
|
for i, s2 := range e {
|
|
e[j] = e[i]
|
|
// log.Warnf("ratio %f, dist: %d, threshold %f",
|
|
// c1/float64((*seqs)[s2.Father].Weight),
|
|
// e[i].Dist, math.Pow(ratio, float64(e[i].Dist)))
|
|
if (c1 / float64((*seqs)[s2.Father].Weight)) < math.Pow(ratio, float64(e[i].Dist)) {
|
|
j++
|
|
} else {
|
|
(*seqs)[s2.Father].SonCount--
|
|
}
|
|
}
|
|
s1.Edges = e[0:j]
|
|
}
|
|
}
|
|
|
|
// sortSamples sorts the sequences in each sample by their increasing count
|
|
func sortSamples(samples map[string]*([]*seqPCR)) {
|
|
|
|
for _, s := range samples {
|
|
sort.SliceStable(*s, func(i, j int) bool {
|
|
return (*s)[i].Count < (*s)[j].Count
|
|
})
|
|
}
|
|
|
|
}
|
|
|
|
func ObicleanStatus(seq *seqPCR) string {
|
|
if len(seq.Edges) == 0 {
|
|
if seq.SonCount > 0 {
|
|
return "h"
|
|
} else {
|
|
return "s"
|
|
}
|
|
} else {
|
|
return "i"
|
|
}
|
|
}
|
|
|
|
func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
|
|
ratio := make([][]Ratio, 25)
|
|
|
|
for name, seqs := range samples {
|
|
|
|
for _, seq := range *seqs {
|
|
for _, edge := range seq.Edges {
|
|
father := (*seqs)[edge.Father]
|
|
if father.Weight >= minStatRatio && edge.Dist == 1 {
|
|
s := father.Sequence.Sequence()
|
|
ratio[edge.NucPair] = append(ratio[edge.NucPair],
|
|
Ratio{
|
|
Sample: name,
|
|
SeqID: father.Sequence.Id(),
|
|
OriginalStatus: Status(father.Sequence)[name],
|
|
WOriginal: father.Weight,
|
|
WMutant: seq.Weight,
|
|
COriginal: father.Count,
|
|
CMutant: seq.Count,
|
|
Pos: edge.Pos,
|
|
Length: father.Sequence.Len(),
|
|
A: bytes.Count(s, []byte("a")),
|
|
C: bytes.Count(s, []byte("c")),
|
|
G: bytes.Count(s, []byte("g")),
|
|
T: bytes.Count(s, []byte("t"))})
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
|
|
return ratio
|
|
|
|
}
|
|
|
|
func BuildSeqGraph(samples map[string]*[]*seqPCR,
|
|
maxError, workers int) {
|
|
|
|
sortSamples(samples)
|
|
|
|
npairs := 0
|
|
for _, seqs := range samples {
|
|
nseq := len(*seqs)
|
|
npairs += nseq * (nseq - 1) / 2
|
|
}
|
|
|
|
var bar *progressbar.ProgressBar
|
|
if obidefault.ProgressBar() {
|
|
pbopt := make([]progressbar.Option, 0, 5)
|
|
pbopt = append(pbopt,
|
|
progressbar.OptionSetWriter(os.Stderr),
|
|
progressbar.OptionSetWidth(15),
|
|
progressbar.OptionShowIts(),
|
|
progressbar.OptionSetPredictTime(true),
|
|
progressbar.OptionSetDescription("[One error graph]"),
|
|
)
|
|
bar = progressbar.NewOptions(npairs, pbopt...)
|
|
}
|
|
|
|
for _, seqs := range samples {
|
|
np := buildSamplePairs(seqs, workers)
|
|
if bar != nil {
|
|
bar.Add(np)
|
|
}
|
|
}
|
|
|
|
if maxError > 1 {
|
|
if obidefault.ProgressBar() {
|
|
pbopt := make([]progressbar.Option, 0, 5)
|
|
pbopt = append(pbopt,
|
|
progressbar.OptionSetWriter(os.Stderr),
|
|
progressbar.OptionSetWidth(15),
|
|
progressbar.OptionShowIts(),
|
|
progressbar.OptionSetPredictTime(true),
|
|
progressbar.OptionSetDescription("[Adds multiple errors]"),
|
|
)
|
|
bar = progressbar.NewOptions(npairs, pbopt...)
|
|
}
|
|
|
|
for _, seqs := range samples {
|
|
np := extendSimilarityGraph(seqs, maxError, workers)
|
|
if bar != nil {
|
|
bar.Add(np)
|
|
}
|
|
}
|
|
}
|
|
}
|