Add some options to the obiconsensus command

Former-commit-id: cf30a404a4943e4527106c977b01712ef454e028
This commit is contained in:
2023-03-28 21:23:27 +07:00
parent e863dc456a
commit 4bf041be62
3 changed files with 146 additions and 50 deletions

View File

@ -13,7 +13,7 @@ import (
) )
func main() { func main() {
optionParser := obioptions.GenerateOptionParser(obiconvert.OptionSet) optionParser := obioptions.GenerateOptionParser(obiconsensus.OptionSet)
_, args := optionParser(os.Args) _, args := optionParser(os.Args)
obiconvert.SetFullFileBatch() obiconvert.SetFullFileBatch()
@ -25,7 +25,7 @@ func main() {
os.Exit(1) os.Exit(1)
} }
consensus := obiconsensus.Consensus(fs, 0.95) consensus := obiconsensus.Consensus(fs)
obiconvert.CLIWriteBioSequences(consensus, true) obiconvert.CLIWriteBioSequences(consensus, true)
obiiter.WaitForLastPipe() obiiter.WaitForLastPipe()

View File

@ -1,6 +1,9 @@
package obiconsensus package obiconsensus
import ( import (
"fmt"
"os"
"path"
"sort" "sort"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
@ -12,25 +15,30 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils" "git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
) )
func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSequence, error) { func BuildConsensus(seqs obiseq.BioSequenceSlice,
kmer_size int, quorum float64,
min_depth float64,
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
log.Printf("Number of reads : %d\n", len(seqs)) log.Printf("Number of reads : %d\n", len(seqs))
longest := make([]int, len(seqs)) if kmer_size < 0 {
longest := make([]int, len(seqs))
for i := range seqs { for i := range seqs {
s := seqs[i : i+1] s := seqs[i : i+1]
sa := obisuffix.BuildSuffixArray(&s) sa := obisuffix.BuildSuffixArray(&s)
longest[i] = obiutils.MaxSlice(sa.CommonSuffix()) longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
}
o := obiutils.Order(sort.IntSlice(longest))
i := int(float64(len(seqs)) * quorum)
kmer_size = longest[o[i]] + 1
log.Printf("estimated kmer size : %d", kmer_size)
} }
o := obiutils.Order(sort.IntSlice(longest)) graph := obikmer.MakeDeBruijnGraph(kmer_size)
i := int(float64(len(seqs)) * quorum)
kmersize := longest[o[i]] + 1
log.Printf("estimated kmer size : %d", kmersize)
graph := obikmer.MakeDeBruijnGraph(kmersize)
for _, s := range seqs { for _, s := range seqs {
graph.Push(s) graph.Push(s)
@ -38,50 +46,64 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSe
log.Printf("Graph size : %d\n", graph.Len()) log.Printf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len() total_kmer := graph.Len()
spectrum := graph.LinkSpectrum()
cum := make(map[int]int)
spectrum[1] = 0
for i := 2; i < len(spectrum); i++ {
spectrum[i] += spectrum[i-1]
cum[spectrum[i]]++
}
max := 0
kmax := 0
for k, obs := range cum {
if obs > max {
max = obs
kmax = k
}
}
threshold := 0 threshold := 0
for i, total := range spectrum {
if total == kmax { switch {
threshold = i case min_depth < 0:
break spectrum := graph.LinkSpectrum()
cum := make(map[int]int)
spectrum[1] = 0
for i := 2; i < len(spectrum); i++ {
spectrum[i] += spectrum[i-1]
cum[spectrum[i]]++
} }
max := 0
kmax := 0
for k, obs := range cum {
if obs > max {
max = obs
kmax = k
}
}
for i, total := range spectrum {
if total == kmax {
threshold = i
break
}
}
threshold /= 2
case min_depth >= 1:
threshold = int(min_depth)
default:
threshold = int(float64(len(seqs)) * min_depth)
} }
threshold /= 2
graph.FilterMin(threshold) graph.FilterMin(threshold)
log.Printf("Graph size : %d\n", graph.Len()) log.Printf("Graph size : %d\n", graph.Len())
// file, err := os.Create( if save_graph {
// fmt.Sprintf("%s.gml", seqs[0].Source()))
// if err != nil { file, err := os.Create(path.Join(dirname,
// fmt.Println(err) fmt.Sprintf("%s.gml", seqs[0].Source())))
// } else {
// file.WriteString(graph.GML()) if err != nil {
// file.Close() fmt.Println(err)
// } } else {
file.WriteString(graph.Gml())
file.Close()
}
}
seq, err := graph.LongestConsensus(seqs[0].Source()) seq, err := graph.LongestConsensus(seqs[0].Source())
seq.SetCount(len(seqs)) seq.SetCount(len(seqs))
seq.SetAttribute("seq_length", seq.Len()) seq.SetAttribute("seq_length", seq.Len())
seq.SetAttribute("kmer_size", kmersize) seq.SetAttribute("kmer_size", kmer_size)
seq.SetAttribute("kmer_min_occur", threshold) seq.SetAttribute("kmer_min_occur", threshold)
seq.SetAttribute("kmer_max_occur", graph.MaxLink()) seq.SetAttribute("kmer_max_occur", graph.MaxLink())
seq.SetAttribute("filtered_graph_size", graph.Len()) seq.SetAttribute("filtered_graph_size", graph.Len())
@ -90,10 +112,23 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSe
return seq, err return seq, err
} }
func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequence { func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
newIter := obiiter.MakeIBioSequence() newIter := obiiter.MakeIBioSequence()
size := 10 size := 10
if CLISaveGraphToFiles() {
dirname := CLIGraphFilesDirectory()
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
// path does not exist or is not directory
os.RemoveAll(dirname)
err := os.Mkdir(dirname, 0755)
if err != nil {
log.Panicf("Cannot create directory %s for saving graphs", dirname)
}
}
}
newIter.Add(1) newIter.Add(1)
go func() { go func() {
@ -107,7 +142,12 @@ func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequen
for iterator.Next() { for iterator.Next() {
seqs := iterator.Get() seqs := iterator.Get()
consensus, err := BuildConsensus(seqs.Slice(), quorum)
consensus, err := BuildConsensus(seqs.Slice(),
CLIKmerSize(), CLIThreshold(),
CLIKmerDepth(),
CLISaveGraphToFiles(), CLIGraphFilesDirectory(),
)
if err == nil { if err == nil {
buffer = append(buffer, consensus) buffer = append(buffer, consensus)
@ -118,7 +158,7 @@ func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequen
order++ order++
buffer = obiseq.MakeBioSequenceSlice() buffer = obiseq.MakeBioSequenceSlice()
} }
seqs.Recycle() seqs.Recycle(true)
} }
if len(buffer) > 0 { if len(buffer) > 0 {

View File

@ -5,9 +5,65 @@ import (
"github.com/DavidGamba/go-getoptions" "github.com/DavidGamba/go-getoptions"
) )
// Backing storage for the obiconsensus command-line options. Each variable
// is initialised with the default used when the option is not given.
var (
	// _saveGraph is the directory receiving the graph files; the sentinel
	// value "__@@NOSAVE@@__" means that saving was not requested.
	_saveGraph = "__@@NOSAVE@@__"

	// _kmerSize is the kmer size; a negative value asks for the size to be
	// estimated from the data.
	_kmerSize = -1

	// _threshold is the ratio used when estimating the kmer size.
	_threshold = 0.99

	// _mindepth is the minimum kmer depth; a negative value asks for the
	// depth to be estimated from the data.
	_mindepth = -1.0
)
// ObiconsensusOptionSet registers the options specific to the obiconsensus
// command on the given option parser:
//
//   - --save-graph: directory in which the De Bruijn graphs are written;
//   - --kmer-size: kmer size used to build the graph;
//   - --threshold: ratio used when the kmer size is estimated;
//   - --min-depth: minimum depth of a kmer to be kept in the graph.
//
// Every option is bound to a package-level variable, whose current value is
// passed as the option default.
func ObiconsensusOptionSet(options *getoptions.GetOpt) {
	// The graphs are written as ".gml" files, so the help text names the
	// GML format.
	options.StringVar(&_saveGraph, "save-graph", _saveGraph,
		options.Description("Creates a directory containing the set of De Bruijn graphs used by "+
			"the obiconsensus algorithm. "+
			"The graph files follow the GML format."),
	)

	options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
		options.ArgName("SIZE"),
		options.Description("The size of the kmer used to build the consensus. "+
			"Default value = -1, which means that the kmer size is estimated from the data"),
	)

	options.Float64Var(&_threshold, "threshold", _threshold,
		options.ArgName("RATIO"),
		options.Description("A threshold between 0 and 1 used to determine the optimal "+
			"kmer size"),
	)

	// The consensus builder treats a DEPTH >= 1 as an absolute count and a
	// DEPTH in [0, 1) as a fraction of the reads; the help text mirrors it.
	options.Float64Var(&_mindepth, "min-depth", _mindepth,
		options.ArgName("DEPTH"),
		options.Description("If DEPTH is between 0 and 1, it corresponds to the fraction of the "+
			"reads in which a kmer must occur to be conserved in the graph. If DEPTH is greater "+
			"than or equal to 1, it indicates the minimum count of occurrences for a kmer to be kept. "+
			"Default value = -1, which means that the DEPTH is estimated from the data"),
	)
}
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
obiconvert.InputOptionSet(options) obiconvert.InputOptionSet(options)
obiconvert.OutputOptionSet(options) obiconvert.OutputOptionSet(options)
ObiconsensusOptionSet(options)
} }
// CLISaveGraphToFiles reports whether the obiconsensus graphs must be saved,
// i.e. whether the save-graph option holds something other than its
// sentinel default value.
func CLISaveGraphToFiles() bool {
	const noSave = "__@@NOSAVE@@__"
	return _saveGraph != noSave
}
// CLIGraphFilesDirectory returns the value of the save-graph option, which
// is the directory where the graph files are saved.
func CLIGraphFilesDirectory() string {
	dirname := _saveGraph
	return dirname
}
// CLIKmerSize returns the value of the kmer-size option. Its default is -1,
// which the option help describes as "estimated from the data".
func CLIKmerSize() int {
	size := _kmerSize
	return size
}
// CLIKmerDepth returns the value of the min-depth option. Its default is
// -1, which the option help describes as "estimated from the data".
func CLIKmerDepth() float64 {
	depth := _mindepth
	return depth
}
// CLIThreshold returns the value of the threshold option, the ratio used to
// determine the kmer size when it is estimated.
func CLIThreshold() float64 {
	ratio := _threshold
	return ratio
}