mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Add some option to the obiconsensus command
Former-commit-id: cf30a404a4943e4527106c977b01712ef454e028
This commit is contained in:
@ -13,7 +13,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
optionParser := obioptions.GenerateOptionParser(obiconvert.OptionSet)
|
optionParser := obioptions.GenerateOptionParser(obiconsensus.OptionSet)
|
||||||
|
|
||||||
_, args := optionParser(os.Args)
|
_, args := optionParser(os.Args)
|
||||||
obiconvert.SetFullFileBatch()
|
obiconvert.SetFullFileBatch()
|
||||||
@ -25,7 +25,7 @@ func main() {
|
|||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
consensus := obiconsensus.Consensus(fs, 0.95)
|
consensus := obiconsensus.Consensus(fs)
|
||||||
obiconvert.CLIWriteBioSequences(consensus, true)
|
obiconvert.CLIWriteBioSequences(consensus, true)
|
||||||
|
|
||||||
obiiter.WaitForLastPipe()
|
obiiter.WaitForLastPipe()
|
||||||
|
@ -1,6 +1,9 @@
|
|||||||
package obiconsensus
|
package obiconsensus
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path"
|
||||||
"sort"
|
"sort"
|
||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
@ -12,25 +15,30 @@ import (
|
|||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
)
|
)
|
||||||
|
|
||||||
func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSequence, error) {
|
func BuildConsensus(seqs obiseq.BioSequenceSlice,
|
||||||
|
kmer_size int, quorum float64,
|
||||||
|
min_depth float64,
|
||||||
|
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
|
||||||
|
|
||||||
log.Printf("Number of reads : %d\n", len(seqs))
|
log.Printf("Number of reads : %d\n", len(seqs))
|
||||||
|
|
||||||
longest := make([]int, len(seqs))
|
if kmer_size < 0 {
|
||||||
|
longest := make([]int, len(seqs))
|
||||||
|
|
||||||
for i := range seqs {
|
for i := range seqs {
|
||||||
s := seqs[i : i+1]
|
s := seqs[i : i+1]
|
||||||
sa := obisuffix.BuildSuffixArray(&s)
|
sa := obisuffix.BuildSuffixArray(&s)
|
||||||
longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
|
longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
|
||||||
|
}
|
||||||
|
|
||||||
|
o := obiutils.Order(sort.IntSlice(longest))
|
||||||
|
i := int(float64(len(seqs)) * quorum)
|
||||||
|
|
||||||
|
kmer_size = longest[o[i]] + 1
|
||||||
|
log.Printf("estimated kmer size : %d", kmer_size)
|
||||||
}
|
}
|
||||||
|
|
||||||
o := obiutils.Order(sort.IntSlice(longest))
|
graph := obikmer.MakeDeBruijnGraph(kmer_size)
|
||||||
i := int(float64(len(seqs)) * quorum)
|
|
||||||
|
|
||||||
kmersize := longest[o[i]] + 1
|
|
||||||
log.Printf("estimated kmer size : %d", kmersize)
|
|
||||||
|
|
||||||
graph := obikmer.MakeDeBruijnGraph(kmersize)
|
|
||||||
|
|
||||||
for _, s := range seqs {
|
for _, s := range seqs {
|
||||||
graph.Push(s)
|
graph.Push(s)
|
||||||
@ -38,50 +46,64 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSe
|
|||||||
|
|
||||||
log.Printf("Graph size : %d\n", graph.Len())
|
log.Printf("Graph size : %d\n", graph.Len())
|
||||||
total_kmer := graph.Len()
|
total_kmer := graph.Len()
|
||||||
spectrum := graph.LinkSpectrum()
|
|
||||||
cum := make(map[int]int)
|
|
||||||
|
|
||||||
spectrum[1] = 0
|
|
||||||
for i := 2; i < len(spectrum); i++ {
|
|
||||||
spectrum[i] += spectrum[i-1]
|
|
||||||
cum[spectrum[i]]++
|
|
||||||
}
|
|
||||||
|
|
||||||
max := 0
|
|
||||||
kmax := 0
|
|
||||||
for k, obs := range cum {
|
|
||||||
if obs > max {
|
|
||||||
max = obs
|
|
||||||
kmax = k
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
threshold := 0
|
threshold := 0
|
||||||
for i, total := range spectrum {
|
|
||||||
if total == kmax {
|
switch {
|
||||||
threshold = i
|
case min_depth < 0:
|
||||||
break
|
spectrum := graph.LinkSpectrum()
|
||||||
|
cum := make(map[int]int)
|
||||||
|
|
||||||
|
spectrum[1] = 0
|
||||||
|
for i := 2; i < len(spectrum); i++ {
|
||||||
|
spectrum[i] += spectrum[i-1]
|
||||||
|
cum[spectrum[i]]++
|
||||||
}
|
}
|
||||||
|
|
||||||
|
max := 0
|
||||||
|
kmax := 0
|
||||||
|
for k, obs := range cum {
|
||||||
|
if obs > max {
|
||||||
|
max = obs
|
||||||
|
kmax = k
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, total := range spectrum {
|
||||||
|
if total == kmax {
|
||||||
|
threshold = i
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
threshold /= 2
|
||||||
|
case min_depth >= 1:
|
||||||
|
threshold = int(min_depth)
|
||||||
|
default:
|
||||||
|
threshold = int(float64(len(seqs)) * min_depth)
|
||||||
}
|
}
|
||||||
threshold /= 2
|
|
||||||
graph.FilterMin(threshold)
|
graph.FilterMin(threshold)
|
||||||
|
|
||||||
log.Printf("Graph size : %d\n", graph.Len())
|
log.Printf("Graph size : %d\n", graph.Len())
|
||||||
|
|
||||||
// file, err := os.Create(
|
if save_graph {
|
||||||
// fmt.Sprintf("%s.gml", seqs[0].Source()))
|
|
||||||
|
file, err := os.Create(path.Join(dirname,
|
||||||
// if err != nil {
|
fmt.Sprintf("%s.gml", seqs[0].Source())))
|
||||||
// fmt.Println(err)
|
|
||||||
// } else {
|
if err != nil {
|
||||||
// file.WriteString(graph.GML())
|
fmt.Println(err)
|
||||||
// file.Close()
|
} else {
|
||||||
// }
|
file.WriteString(graph.Gml())
|
||||||
|
file.Close()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
seq, err := graph.LongestConsensus(seqs[0].Source())
|
seq, err := graph.LongestConsensus(seqs[0].Source())
|
||||||
|
|
||||||
seq.SetCount(len(seqs))
|
seq.SetCount(len(seqs))
|
||||||
seq.SetAttribute("seq_length", seq.Len())
|
seq.SetAttribute("seq_length", seq.Len())
|
||||||
seq.SetAttribute("kmer_size", kmersize)
|
seq.SetAttribute("kmer_size", kmer_size)
|
||||||
seq.SetAttribute("kmer_min_occur", threshold)
|
seq.SetAttribute("kmer_min_occur", threshold)
|
||||||
seq.SetAttribute("kmer_max_occur", graph.MaxLink())
|
seq.SetAttribute("kmer_max_occur", graph.MaxLink())
|
||||||
seq.SetAttribute("filtered_graph_size", graph.Len())
|
seq.SetAttribute("filtered_graph_size", graph.Len())
|
||||||
@ -90,10 +112,23 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSe
|
|||||||
return seq, err
|
return seq, err
|
||||||
}
|
}
|
||||||
|
|
||||||
func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequence {
|
func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||||
newIter := obiiter.MakeIBioSequence()
|
newIter := obiiter.MakeIBioSequence()
|
||||||
size := 10
|
size := 10
|
||||||
|
|
||||||
|
if CLISaveGraphToFiles() {
|
||||||
|
dirname := CLIGraphFilesDirectory()
|
||||||
|
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
|
||||||
|
// path does not exist or is not directory
|
||||||
|
os.RemoveAll(dirname)
|
||||||
|
err := os.Mkdir(dirname, 0755)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Panicf("Cannot create directory %s for saving graphs", dirname)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
newIter.Add(1)
|
newIter.Add(1)
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
@ -107,7 +142,12 @@ func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequen
|
|||||||
|
|
||||||
for iterator.Next() {
|
for iterator.Next() {
|
||||||
seqs := iterator.Get()
|
seqs := iterator.Get()
|
||||||
consensus, err := BuildConsensus(seqs.Slice(), quorum)
|
|
||||||
|
consensus, err := BuildConsensus(seqs.Slice(),
|
||||||
|
CLIKmerSize(), CLIThreshold(),
|
||||||
|
CLIKmerDepth(),
|
||||||
|
CLISaveGraphToFiles(), CLIGraphFilesDirectory(),
|
||||||
|
)
|
||||||
|
|
||||||
if err == nil {
|
if err == nil {
|
||||||
buffer = append(buffer, consensus)
|
buffer = append(buffer, consensus)
|
||||||
@ -118,7 +158,7 @@ func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequen
|
|||||||
order++
|
order++
|
||||||
buffer = obiseq.MakeBioSequenceSlice()
|
buffer = obiseq.MakeBioSequenceSlice()
|
||||||
}
|
}
|
||||||
seqs.Recycle()
|
seqs.Recycle(true)
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(buffer) > 0 {
|
if len(buffer) > 0 {
|
||||||
|
@ -5,9 +5,65 @@ import (
|
|||||||
"github.com/DavidGamba/go-getoptions"
|
"github.com/DavidGamba/go-getoptions"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var _saveGraph = "__@@NOSAVE@@__"
|
||||||
|
var _kmerSize = -1
|
||||||
|
var _threshold = 0.99
|
||||||
|
var _mindepth = -1.0
|
||||||
|
|
||||||
|
func ObiconsensusOptionSet(options *getoptions.GetOpt) {
|
||||||
|
|
||||||
|
options.StringVar(&_saveGraph, "save-graph", _saveGraph,
|
||||||
|
options.Description("Creates a directory containing the set of De Bruijn graphs used by "+
|
||||||
|
"the obiconsensus algorithm. "+
|
||||||
|
"The graph files follow the graphml format."),
|
||||||
|
)
|
||||||
|
|
||||||
|
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
||||||
|
options.ArgName("SIZE"),
|
||||||
|
options.Description("The size of the kmer used to build the consensus. "+
|
||||||
|
"Default value = -1, which means that the kmer size is estimated from the data"),
|
||||||
|
)
|
||||||
|
|
||||||
|
options.Float64Var(&_threshold, "threshold", _threshold,
|
||||||
|
options.ArgName("RATIO"),
|
||||||
|
options.Description("A threshold between O and 1 used to determine the optimal "+
|
||||||
|
"kmer size"),
|
||||||
|
)
|
||||||
|
|
||||||
|
options.Float64Var(&_mindepth, "min-depth", _mindepth,
|
||||||
|
options.ArgName("DEPTH"),
|
||||||
|
options.Description("if DEPTH is between 0 and 1, it corresponds to fraction of the "+
|
||||||
|
"reads in which a kmer must occurs to be conserved in the graph. If DEPTH is greater "+
|
||||||
|
"than 1, indicate the minimum count of occurrence for a kmer to be kept. "+
|
||||||
|
"Default value = -1, which means that the DEPTH is estimated from the data"),
|
||||||
|
)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func OptionSet(options *getoptions.GetOpt) {
|
func OptionSet(options *getoptions.GetOpt) {
|
||||||
obiconvert.InputOptionSet(options)
|
obiconvert.InputOptionSet(options)
|
||||||
obiconvert.OutputOptionSet(options)
|
obiconvert.OutputOptionSet(options)
|
||||||
|
ObiconsensusOptionSet(options)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns true it the obliclean graphs must be saved
|
||||||
|
func CLISaveGraphToFiles() bool {
|
||||||
|
return _saveGraph != "__@@NOSAVE@@__"
|
||||||
|
}
|
||||||
|
|
||||||
|
// It returns the directory where the graph files are saved
|
||||||
|
func CLIGraphFilesDirectory() string {
|
||||||
|
return _saveGraph
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIKmerSize() int {
|
||||||
|
return _kmerSize
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIKmerDepth() float64 {
|
||||||
|
return _mindepth
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLIThreshold() float64 {
|
||||||
|
return _threshold
|
||||||
|
}
|
Reference in New Issue
Block a user