Files
obitools4/pkg/obitools/obiclean/graph.go
Eric Coissac 1a28d5ed64 Add progress bar configuration and conditional display
This commit introduces a new configuration module `obidefault` to manage progress bar settings, allowing users to disable progress bars via a `--no-progressbar` option. It updates various packages to conditionally display progress bars based on this new configuration, improving user experience by providing control over progress bar output. The changes also include improvements to progress bar handling in several packages, ensuring they are only displayed when appropriate (e.g., when stderr is a terminal and stdout is not piped).
2026-02-08 16:14:02 +01:00

548 lines
11 KiB
Go

package obiclean
import (
"bytes"
"fmt"
"math"
"os"
"path"
"sort"
"sync"
"text/template"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/schollz/progressbar/v3"
)
// Ratio is one record of the empirical error statistics assembled by
// EstimateRatio. Each record describes a graph edge of distance 1 between a
// sequence (the "mutant") and its father in the graph (the "original").
type Ratio struct {
	// Sample is the sample name; SeqID and OriginalStatus are the identifier
	// and the per-sample status of the original sequence.
	Sample, SeqID, OriginalStatus string

	// WOriginal and WMutant are the Weight values of the original and of the
	// mutant sequences.
	WOriginal, WMutant int

	// COriginal and CMutant are the Count values of the original and of the
	// mutant sequences.
	COriginal, CMutant int

	// Pos is the position stored on the edge; Length is the length of the
	// original sequence.
	Pos, Length int

	// A, C, G and T are the numbers of 'a', 'c', 'g' and 't' bytes found in
	// the original sequence.
	A, C, G, T int
}
// Edge links a sequence (the son, which owns the edge) to a father sequence
// of the same sample.
type Edge struct {
	// Father is the index of the father sequence in the sample slice.
	Father int

	// From and To are the two symbols attached to the edge; both are '-'
	// for edges added by extendSimilarityGraph.
	// NOTE(review): which one belongs to the father depends on the order of
	// the values returned by obialign.D1Or0 — confirm.
	From, To byte

	// Pos is the position attached to the edge (-1 for edges added by
	// extendSimilarityGraph), NucPair is nucPair(From, To) and Dist is the
	// distance between the son and the father.
	Pos, NucPair, Dist int
}
// makeEdge builds an Edge pointing to the sequence at index father, at
// distance dist and position pos. The NucPair field is derived from the
// from/to symbols with nucPair.
func makeEdge(father, dist, pos int, from, to byte) Edge {
	var e Edge

	e.Father = father
	e.From, e.To = from, to
	e.Pos = pos
	e.NucPair = nucPair(from, to)
	e.Dist = dist

	return e
}
// EmpiricalDistCsv writes the ratio statistics produced during graph building
// to the file named filename, in CSV format.
//
// data is indexed by nucleotide-pair code: the index of each inner slice is
// decoded with intToNucPair into the "Origin" and "Mutant" columns, and every
// Ratio of that slice becomes one CSV row. A header line is written first.
//
// If the file or the output stream cannot be created, the error is reported
// on stderr and nothing is written. Errors occurring while writing rows are
// not checked.
func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
	file, err := os.Create(filename)
	if err != nil {
		// Without a file there is nothing to write to: give up instead of
		// handing a nil file to the compression layer.
		fmt.Fprintln(os.Stderr, err)
		return
	}

	// NOTE(review): the compressed parameter is not used; the stream is
	// always created with (true, true). Confirm against the signature of
	// obiutils.CompressStream whether compressed should be forwarded here.
	destfile, err := obiutils.CompressStream(file, true, true)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		// Do not leak the descriptor opened above.
		file.Close()
		return
	}
	defer destfile.Close()

	var bar *progressbar.ProgressBar
	if obidefault.ProgressBar() {
		pbopt := make([]progressbar.Option, 0, 5)
		pbopt = append(pbopt,
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription("[Save CSV stat ratio file]"),
		)
		bar = progressbar.NewOptions(len(data), pbopt...)
	}

	fmt.Fprintln(destfile, "Sample,Origin_id,Origin_status,Origin,Mutant,Origin_Weight,Mutant_Weight,Origin_Count,Mutant_Count,Position,Origin_length,A,C,G,T")

	for code, dist := range data {
		// The slice index encodes the pair of symbols shared by all the
		// rows of dist.
		a1, a2 := intToNucPair(code)
		for _, ratio := range dist {
			fmt.Fprintf(destfile, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
				ratio.Sample,
				ratio.SeqID,
				ratio.OriginalStatus,
				a1, a2,
				ratio.WOriginal,
				ratio.WMutant,
				ratio.COriginal,
				ratio.CMutant,
				ratio.Pos,
				ratio.Length,
				ratio.A,
				ratio.C,
				ratio.G,
				ratio.T,
			)
		}
		// The progress bar counts nucleotide-pair codes, not rows.
		if bar != nil {
			bar.Add(1)
		}
	}
}
// Gml renders the graph of one sample as a GML document and returns it as a
// string.
//
// A node is emitted for every sequence that has at least one edge or a
// positive SonCount. Its shape is "circle" when its Count is at least
// statThreshold and "rectangle" otherwise; it is filled in blue (#0000FF)
// when it has sons and no edge, in green (#00FF00) otherwise. Every edge is
// emitted from the sequence to its father, labelled with its distance and
// coloured red (#FF0000) when that distance is greater than 1.
//
// Gml panics if the template cannot be parsed or executed.
func Gml(seqs *[]*seqPCR, sample string, statThreshold int) string {
	const gmlTemplate = `graph [
comment "Obiclean graph for sample {{ Name }}"
directed 1
{{range $index, $data:= .}}
{{ if or $data.Edges (gt $data.SonCount 0)}}
node [ id {{$index}}
graphics [
type "{{ Shape $data.Count }}"
fill "{{ if and (gt $data.SonCount 0) (not $data.Edges)}}#0000FF{{ else }}#00FF00{{ end }}"
h {{ Sqrt $data.Count }}
w {{ Sqrt $data.Count }}
]
weight {{$data.Count}}
]
{{ end }}
{{ end }}
{{range $index, $data:= .}}
{{range $i, $edge:= $data.Edges}}
edge [ source {{$index}}
target {{$edge.Father}}
color "{{ if gt (index $data.Edges $i).Dist 1 }}#FF0000{{ else }}#00FF00{{ end }}"
label "{{(index $data.Edges $i).Dist}}"
]
{{ end }}
{{ end }}
]
`

	// Helper functions made available to the template.
	helpers := template.FuncMap{
		// Sqrt gives the node size: three times the integer square root
		// of the count.
		"Sqrt": func(count int) int {
			return 3 * int(math.Floor(math.Sqrt(float64(count))))
		},
		"Name": func() string {
			return sample
		},
		"Shape": func(count int) string {
			if count >= statThreshold {
				return "circle"
			}
			return "rectangle"
		},
	}

	tpl, err := template.New("gml_digraph").Funcs(helpers).Parse(gmlTemplate)
	if err != nil {
		panic(err)
	}

	var out bytes.Buffer
	if err := tpl.Execute(&out, *seqs); err != nil {
		panic(err)
	}

	return out.String()
}
// SaveGMLGraphs writes one GML file per sample into the directory dirname.
// Each file is named "<sample name>.gml" and contains the output of Gml for
// that sample, with statThreshold forwarded unchanged.
//
// When dirname does not exist or is not a directory, whatever sits at that
// path is removed and the directory is created; the function panics if the
// creation fails. A failure on an individual file is logged and the
// remaining samples are still processed.
func SaveGMLGraphs(dirname string,
	samples map[string]*[]*seqPCR,
	statThreshold int,
) {
	if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
		// path does not exist or is not directory
		os.RemoveAll(dirname)
		if err := os.Mkdir(dirname, 0755); err != nil {
			log.Panicf("Cannot create directory %s for saving graphs: %v", dirname, err)
		}
	}

	var bar *progressbar.ProgressBar
	if obidefault.ProgressBar() {
		pbopt := make([]progressbar.Option, 0, 5)
		pbopt = append(pbopt,
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription("[Save GML Graph files]"),
		)
		bar = progressbar.NewOptions(len(samples), pbopt...)
	}

	for name, seqs := range samples {
		filename := path.Join(dirname, fmt.Sprintf("%s.gml", name))

		file, err := os.Create(filename)
		if err != nil {
			// Skip this sample only: the other graphs can still be saved.
			log.Errorf("Cannot create GML file %s: %v", filename, err)
		} else {
			if _, err := file.WriteString(Gml(seqs, name, statThreshold)); err != nil {
				log.Errorf("Cannot write GML file %s: %v", filename, err)
			}
			// Close can report a deferred write error, so check it as well.
			if err := file.Close(); err != nil {
				log.Errorf("Cannot close GML file %s: %v", filename, err)
			}
		}

		// The bar counts processed samples, whether or not the file was saved.
		if bar != nil {
			bar.Add(1)
		}
	}
}
// nucPair encodes a pair of symbols as an integer between 0 and 24.
//
// Each symbol is ranked 1, 2, 3 or 4 when it is 'a', 'c', 'g' or 't'
// (lower case only) and 0 for any other byte; the result is
// 5*rank(a) + rank(b).
func nucPair(a, b byte) int {
	const alphabet = "acgt"

	// rank returns the 1-based position of nuc in alphabet, or 0 when nuc
	// is not one of the four recognised symbols.
	rank := func(nuc byte) int {
		for i := 0; i < len(alphabet); i++ {
			if alphabet[i] == nuc {
				return i + 1
			}
		}
		return 0
	}

	return 5*rank(a) + rank(b)
}
// intToNucPair decodes a pair code into its two symbols: the quotient of
// code by 5 selects the first symbol and the remainder selects the second,
// both taken from the table '-', 'a', 'c', 'g', 't'.
//
// The function panics when code is negative or greater than 24.
func intToNucPair(code int) (a, b byte) {
	const symbols = "-acgt"

	first, second := code/5, code%5

	return symbols[first], symbols[second]
}
// reweightSequences recomputes the Weight of every sequence of a sample.
//
// Each Weight starts equal to the sequence Count. The weight of a sequence
// is then shared out between its fathers, in proportion to the fathers'
// Count. Sequences without sons are processed first; a sequence that has
// sons is processed only once all of them have contributed, that is when
// its AddedSons counter has reached its SonCount. The passes are repeated
// until one of them processes no sequence.
func reweightSequences(seqs *[]*seqPCR) {
	all := *seqs

	for _, s := range all {
		s.Weight = s.Count
	}

	// distribute pushes the weight of s to its fathers and resets its
	// AddedSons counter, so that s is not selected again by a later pass.
	distribute := func(s *seqPCR) {
		s.AddedSons = 0
		if len(s.Edges) == 0 {
			return
		}

		// Total count of the fathers: the denominator of each share.
		fathersCount := 0.0
		for _, edge := range s.Edges {
			fathersCount += float64(all[edge.Father].Count)
		}

		for _, edge := range s.Edges {
			father := all[edge.Father]
			share := math.Round(float64(s.Weight) * float64(father.Count) / fathersCount)
			father.Weight += int(share)
			father.AddedSons++
		}
	}

	// First the sequences that have no son.
	for _, s := range all {
		if s.SonCount == 0 {
			distribute(s)
		}
	}

	// Then, pass after pass, the sequences whose sons have all contributed.
	for {
		progressed := false
		for _, s := range all {
			if s.SonCount > 0 && s.SonCount == s.AddedSons {
				distribute(s)
				progressed = true
			}
		}
		if !progressed {
			break
		}
	}
}
// buildSamplePairs builds the edges of one sample and then recomputes the
// sequence weights with reweightSequences.
//
// Every sequence i is compared with the sequences j > i whose Count is
// strictly greater. When obialign.D1Or0 returns a positive distance, an edge
// from i (the son) to j (the father) is appended to the son's Edges and the
// father's SonCount is incremented.
//
// The comparisons are spread over workers goroutines, each handling whole
// sons; a value of workers lower than 1 is treated as 1. The returned value
// is the number of sequence pairs of the sample, n*(n-1)/2.
func buildSamplePairs(seqs *[]*seqPCR, workers int) int {
	nseq := len(*seqs)
	running := sync.WaitGroup{}

	// Without at least one worker nobody would drain lineChan: the feeding
	// goroutine would block forever and no edge would be built.
	if workers < 1 {
		workers = 1
	}

	// A father can be shared by sons handled by different workers, so the
	// increments of its SonCount must be serialised. The Edges of a son are
	// only touched by the single worker in charge of that son.
	var sonCountLock sync.Mutex

	linePairs := func(i int) {
		son := (*seqs)[i]
		for j := i + 1; j < nseq; j++ {
			father := (*seqs)[j]
			if father.Count > son.Count {
				d, pos, a1, a2 := obialign.D1Or0(son.Sequence, father.Sequence)
				if d > 0 {
					son.Edges = append(son.Edges, makeEdge(j, d, pos, a2, a1))

					sonCountLock.Lock()
					father.SonCount++
					sonCountLock.Unlock()
				}
			}
		}
	}

	lineChan := make(chan int)

	ff := func() {
		for i := range lineChan {
			linePairs(i)
		}
		running.Done()
	}

	running.Add(workers)
	for i := 0; i < workers; i++ {
		go ff()
	}

	// Feed the son indices to the workers, then signal the end of the work.
	go func() {
		for i := 0; i < nseq; i++ {
			lineChan <- i
		}
		close(lineChan)
	}()

	np := nseq * (nseq - 1) / 2

	running.Wait()

	// All the edges are known at this point: weights can be propagated.
	reweightSequences(seqs)

	return np
}
// extendSimilarityGraph adds edges of distance up to step to the sequences
// of a sample that have no edge yet.
//
// Each such sequence i is compared with the sequences j > i whose Count is
// strictly greater. For the pairs where obialign.D1Or0 returns a negative
// distance, obialign.FastLCSScore is run and the distance is taken as the
// alignment length minus the LCS score. When the score is not negative and
// the distance is at most step (with step > 0), an edge with position -1 and
// symbols '-' is appended to the son and the father's SonCount is
// incremented.
// NOTE(review): a negative value from D1Or0 presumably means "more than one
// difference" — confirm against obialign.
//
// The comparisons are spread over workers goroutines, each owning its own
// scratch matrix; a value of workers lower than 1 is treated as 1. The
// returned value is the number of sequence pairs of the sample, n*(n-1)/2.
// Weights are not recomputed here.
func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int {
	nseq := len(*seqs)
	running := sync.WaitGroup{}

	// Without at least one worker nobody would drain lineChan: the feeding
	// goroutine would block forever and no edge would be added.
	if workers < 1 {
		workers = 1
	}

	// A father can be shared by sons handled by different workers, so the
	// increments of its SonCount must be serialised. The Edges of a son are
	// only touched by the single worker in charge of that son.
	var sonCountLock sync.Mutex

	linePairs := func(matrix *[]uint64, i int) {
		son := (*seqs)[i]
		for j := i + 1; j < nseq; j++ {
			father := (*seqs)[j]
			if father.Count > son.Count {
				d, _, _, _ := obialign.D1Or0(son.Sequence, father.Sequence)
				if d < 0 {
					lcs, lali := obialign.FastLCSScore(son.Sequence, father.Sequence,
						step,
						matrix)
					dist := lali - lcs
					if lcs >= 0 && dist <= step && step > 0 {
						son.Edges = append(son.Edges, makeEdge(j, dist, -1, '-', '-'))

						sonCountLock.Lock()
						father.SonCount++
						sonCountLock.Unlock()
					}
				}
			}
		}
	}

	lineChan := make(chan int)

	ff := func() {
		// One scratch matrix per worker, reused across alignments.
		var matrix []uint64
		for i := range lineChan {
			linePairs(&matrix, i)
		}
		running.Done()
	}

	running.Add(workers)
	for i := 0; i < workers; i++ {
		go ff()
	}

	// Only the sequences still without any edge are submitted. An index is
	// inspected here before it is sent, hence before any worker can modify
	// the Edges of that sequence.
	go func() {
		for i := 0; i < nseq; i++ {
			if len((*seqs)[i].Edges) == 0 {
				lineChan <- i
			}
		}
		close(lineChan)
	}()

	running.Wait()

	np := nseq * (nseq - 1) / 2

	return np
}
// FilterGraphOnRatio removes from the graph the edges that are not supported
// by the abundance ratio.
//
// An edge is kept only when the Weight of the son divided by the Weight of
// the father is strictly lower than ratio raised to the power of the edge
// distance. For every removed edge, the SonCount of the father is
// decremented. The Edges slices are compacted in place.
func FilterGraphOnRatio(seqs *[]*seqPCR, ratio float64) {
	all := *seqs

	for _, son := range all {
		sonWeight := float64(son.Weight)
		edges := son.Edges

		kept := 0
		for i := range edges {
			edge := edges[i]
			// Move the edge to the next free slot; the slot is only
			// validated if the edge passes the test below.
			edges[kept] = edge

			father := all[edge.Father]
			threshold := math.Pow(ratio, float64(edge.Dist))

			if sonWeight/float64(father.Weight) < threshold {
				kept++
				continue
			}

			father.SonCount--
		}

		son.Edges = edges[:kept]
	}
}
// sortSamples reorders, in place, the sequences of every sample by
// increasing Count. The sort is stable: sequences with the same Count keep
// their relative order.
func sortSamples(samples map[string]*([]*seqPCR)) {
	for _, sample := range samples {
		seqs := *sample
		byCount := func(a, b int) bool {
			return seqs[a].Count < seqs[b].Count
		}
		sort.SliceStable(seqs, byCount)
	}
}
// ObicleanStatus returns the one-letter status of a sequence in the graph:
//
//   - "i" when the sequence has at least one edge,
//   - "h" when it has no edge but a positive SonCount,
//   - "s" when it has neither.
//
// NOTE(review): the letters presumably stand for internal, head and
// singleton — confirm against the obiclean documentation.
func ObicleanStatus(seq *seqPCR) string {
	switch {
	case len(seq.Edges) > 0:
		return "i"
	case seq.SonCount > 0:
		return "h"
	default:
		return "s"
	}
}
// EstimateRatio collects the statistics of the distance-1 edges of all the
// samples.
//
// The result has 25 entries, one per nucleotide-pair code (see nucPair).
// Every edge of distance 1 whose father has a Weight of at least
// minStatRatio adds one Ratio to the entry selected by the edge NucPair.
// The Ratio holds the weights and counts of both sequences, the edge
// position, and the identifier, status, length and a/c/g/t composition of
// the father.
func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
	table := make([][]Ratio, 25)

	for sampleName, seqs := range samples {
		for _, mutant := range *seqs {
			for _, link := range mutant.Edges {
				original := (*seqs)[link.Father]

				// Only single-difference edges towards a sufficiently
				// heavy father contribute to the statistics.
				if link.Dist != 1 || original.Weight < minStatRatio {
					continue
				}

				nucleotides := original.Sequence.Sequence()

				observation := Ratio{
					Sample:         sampleName,
					SeqID:          original.Sequence.Id(),
					OriginalStatus: Status(original.Sequence)[sampleName],
					WOriginal:      original.Weight,
					WMutant:        mutant.Weight,
					COriginal:      original.Count,
					CMutant:        mutant.Count,
					Pos:            link.Pos,
					Length:         original.Sequence.Len(),
					A:              bytes.Count(nucleotides, []byte("a")),
					C:              bytes.Count(nucleotides, []byte("c")),
					G:              bytes.Count(nucleotides, []byte("g")),
					T:              bytes.Count(nucleotides, []byte("t")),
				}

				table[link.NucPair] = append(table[link.NucPair], observation)
			}
		}
	}

	return table
}
// BuildSeqGraph builds the graph of every sample.
//
// The sequences of each sample are first sorted by increasing Count, then
// the edges are built with buildSamplePairs. When maxError is greater than
// 1, a second pass runs extendSimilarityGraph with maxError as the maximum
// distance. Both passes use workers goroutines per sample.
//
// A progress bar sized to the total number of sequence pairs is shown for
// each pass when obidefault.ProgressBar() allows it.
func BuildSeqGraph(samples map[string]*[]*seqPCR,
	maxError, workers int) {
	sortSamples(samples)

	totalPairs := 0
	for _, seqs := range samples {
		n := len(*seqs)
		totalPairs += n * (n - 1) / 2
	}

	// newBar creates a progress bar over all the pairs, with the given
	// description.
	newBar := func(description string) *progressbar.ProgressBar {
		return progressbar.NewOptions(totalPairs,
			progressbar.OptionSetWriter(os.Stderr),
			progressbar.OptionSetWidth(15),
			progressbar.OptionShowIts(),
			progressbar.OptionSetPredictTime(true),
			progressbar.OptionSetDescription(description),
		)
	}

	var bar *progressbar.ProgressBar

	// First pass: edges of distance 1.
	if obidefault.ProgressBar() {
		bar = newBar("[One error graph]")
	}
	for _, seqs := range samples {
		done := buildSamplePairs(seqs, workers)
		if bar != nil {
			bar.Add(done)
		}
	}

	if maxError <= 1 {
		return
	}

	// Second pass: edges of larger distance.
	if obidefault.ProgressBar() {
		bar = newBar("[Adds multiple errors]")
	}
	for _, seqs := range samples {
		done := extendSimilarityGraph(seqs, maxError, workers)
		if bar != nil {
			bar.Add(done)
		}
	}
}