Add obiminion first version

Former-commit-id: aa5ace7bd4d2266333715fca7094d1c3cbbb5e6d
This commit is contained in:
Eric Coissac
2024-05-14 08:16:12 +02:00
parent 9e63013bc2
commit 017030bcce
24 changed files with 1599 additions and 469 deletions

View File

@@ -4,92 +4,93 @@ import (
"fmt"
"os"
"path"
"sort"
"slices"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obisuffix"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
)
func BuildConsensus(seqs obiseq.BioSequenceSlice,
consensus_id string,
kmer_size int, quorum float64,
min_depth float64,
max_length int,
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
if save_graph {
if dirname == "" {
dirname = "."
}
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
// path does not exist or is not directory
os.RemoveAll(dirname)
err := os.Mkdir(dirname, 0755)
if err != nil {
log.Panicf("Cannot create directory %s for saving graphs", dirname)
}
}
fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s.fasta", consensus_id)))
if err == nil {
defer fasta.Close()
fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(0, seqs), obiformats.FormatFastSeqJsonHeader, false))
fasta.Close()
}
}
log.Printf("Number of reads : %d\n", len(seqs))
if kmer_size < 0 {
longest := make([]int, len(seqs))
for i := range seqs {
s := seqs[i : i+1]
for i, seq := range seqs {
s := obiseq.BioSequenceSlice{seq}
sa := obisuffix.BuildSuffixArray(&s)
longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
longest[i] = slices.Max(sa.CommonSuffix())
}
o := obiutils.Order(sort.IntSlice(longest))
i := int(float64(len(seqs)) * quorum)
// o := obiutils.Order(sort.IntSlice(longest))
// i := int(float64(len(seqs)) * quorum)
kmer_size = longest[o[i]] + 1
// if i >= len(o) {
// i = len(o) - 1
// }
kmer_size = slices.Max(longest) + 1
// kmer_size = longest[o[i]] + 1
log.Printf("estimated kmer size : %d", kmer_size)
}
graph := obikmer.MakeDeBruijnGraph(kmer_size)
var graph *obikmer.DeBruijnGraph
for {
graph = obikmer.MakeDeBruijnGraph(kmer_size)
for _, s := range seqs {
graph.Push(s)
for _, s := range seqs {
graph.Push(s)
}
if !graph.HasCycle() {
break
}
kmer_size++
log.Infof("Cycle detected, increasing kmer size to %d\n", kmer_size)
}
log.Printf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()
threshold := 0
switch {
case min_depth < 0:
spectrum := graph.LinkSpectrum()
cum := make(map[int]int)
spectrum[1] = 0
for i := 2; i < len(spectrum); i++ {
spectrum[i] += spectrum[i-1]
cum[spectrum[i]]++
}
max := 0
kmax := 0
for k, obs := range cum {
if obs > max {
max = obs
kmax = k
}
}
for i, total := range spectrum {
if total == kmax {
threshold = i
break
}
}
threshold /= 2
case min_depth >= 1:
threshold = int(min_depth)
default:
threshold = int(float64(len(seqs)) * min_depth)
}
graph.FilterMin(threshold)
log.Printf("Graph size : %d\n", graph.Len())
if save_graph {
file, err := os.Create(path.Join(dirname,
fmt.Sprintf("%s.gml", seqs[0].Source())))
fmt.Sprintf("%s_raw_consensus.gml", consensus_id)))
if err != nil {
fmt.Println(err)
@@ -99,29 +100,133 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
}
}
id := seqs[0].Source()
if id == "" {
id = seqs[0].Id()
}
seq, err := graph.LongestConsensus(id)
log.Printf("Graph size : %d\n", graph.Len())
total_kmer := graph.Len()
// threshold := 0
// switch {
// case min_depth < 0:
// spectrum := graph.WeightSpectrum()
// cum := make(map[int]int)
// spectrum[1] = 0
// for i := 2; i < len(spectrum); i++ {
// spectrum[i] += spectrum[i-1]
// cum[spectrum[i]]++
// }
// max := 0
// kmax := 0
// for k, obs := range cum {
// if obs > max {
// max = obs
// kmax = k
// }
// }
// for i, total := range spectrum {
// if total == kmax {
// threshold = i
// break
// }
// }
// threshold /= 2
// if threshold < 1 {
// threshold = 1
// }
// log.Info("Estimated kmer_min_occur = ", threshold)
// case min_depth >= 1:
// threshold = int(min_depth)
// default:
// threshold = int(float64(len(seqs)) * min_depth)
// }
// graph.FilterMinWeight(threshold)
// log.Printf("Graph size : %d\n", graph.Len())
// if save_graph {
// file, err := os.Create(path.Join(dirname,
// fmt.Sprintf("%s_consensus.gml", consensus_id)))
// if err != nil {
// fmt.Println(err)
// } else {
// file.WriteString(graph.Gml())
// file.Close()
// }
// }
seq, err := graph.LongestConsensus(consensus_id, max_length)
sumCount := 0
for _, s := range seqs {
sumCount += s.Count()
if seq != nil {
for _, s := range seqs {
sumCount += s.Count()
}
seq.SetCount(sumCount)
seq.SetAttribute("seq_length", seq.Len())
seq.SetAttribute("kmer_size", kmer_size)
//seq.SetAttribute("kmer_min_occur", threshold)
seq.SetAttribute("kmer_max_occur", graph.MaxWeight())
seq.SetAttribute("filtered_graph_size", graph.Len())
seq.SetAttribute("full_graph_size", total_kmer)
}
seq.SetCount(sumCount)
seq.SetAttribute("seq_length", seq.Len())
seq.SetAttribute("kmer_size", kmer_size)
seq.SetAttribute("kmer_min_occur", threshold)
seq.SetAttribute("kmer_max_occur", graph.MaxLink())
seq.SetAttribute("filtered_graph_size", graph.Len())
seq.SetAttribute("full_graph_size", total_kmer)
return seq, err
}
// func BuildConsensusWithTimeout(seqs obiseq.BioSequenceSlice,
// kmer_size int, quorum float64,
// min_depth float64,
// save_graph bool, dirname string, timeout time.Duration) (*obiseq.BioSequence, error) {
// ctx, cancel := context.WithTimeout(context.Background(), timeout)
// defer cancel()
// consensus := func() *obiseq.BioSequence {
// cons, err := BuildConsensus(seqs, kmer_size, quorum, min_depth, save_graph, dirname,)
// if err != nil {
// cons = nil
// }
// return cons
// }
// computation := func() <-chan *obiseq.BioSequence {
// result := make(chan *obiseq.BioSequence)
// go func() {
// select {
// case <-ctx.Done():
// result <- nil
// default:
// result <- consensus()
// }
// }()
// return result
// }
// calcResult := computation()
// select {
// case result := <-calcResult:
// if result == nil {
// return nil, fmt.Errorf("cannot compute consensus")
// }
// return result, nil
// case <-ctx.Done():
// return nil, fmt.Errorf("compute consensus timeout, exiting")
// }
// }
func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
newIter := obiiter.MakeIBioSequence()
size := 10
@@ -153,10 +258,19 @@ func Consensus(iterator obiiter.IBioSequence) obiiter.IBioSequence {
for iterator.Next() {
seqs := iterator.Get()
consensus, err := BuildConsensus(seqs.Slice(),
sequences := seqs.Slice()
id := sequences[0].Source()
if id == "" {
id = sequences[0].Id()
}
consensus, err := BuildConsensus(sequences,
id,
CLIKmerSize(), CLIThreshold(),
CLIKmerDepth(),
CLISaveGraphToFiles(), CLIGraphFilesDirectory(),
CLIMaxConsensusLength(),
CLISaveGraphToFiles(),
CLIGraphFilesDirectory(),
)
if err == nil {

View File

@@ -9,6 +9,7 @@ var _saveGraph = "__@@NOSAVE@@__"
var _kmerSize = -1
var _threshold = 0.99
var _mindepth = -1.0
var _consensus_max_length = -1
func ObiconsensusOptionSet(options *getoptions.GetOpt) {
@@ -38,6 +39,12 @@ func ObiconsensusOptionSet(options *getoptions.GetOpt) {
"Default value = -1, which means that the DEPTH is estimated from the data"),
)
options.IntVar(&_consensus_max_length, "consensus-max-length", _consensus_max_length,
options.ArgName("LENGTH"),
options.Description("Maximum length of the consensus sequence. "+
"Default value = -1, which means that no limit is applied"),
)
}
func OptionSet(options *getoptions.GetOpt) {
@@ -67,3 +74,7 @@ func CLIKmerDepth() float64 {
// CLIThreshold returns the current value of the package-level _threshold
// variable, which is initialised to 0.99.
//
// Consensus passes this value to BuildConsensus as its quorum argument.
// NOTE(review): the command-line option that sets _threshold is not visible
// in this part of the file — confirm its name and valid range there.
func CLIThreshold() float64 {
return _threshold
}
// CLIMaxConsensusLength returns the value bound to the "consensus-max-length"
// command-line option, stored in _consensus_max_length.
//
// The variable is initialised to -1, which the option's help text describes
// as meaning that no limit is applied. Consensus forwards the value to
// BuildConsensus as its max_length argument.
func CLIMaxConsensusLength() int {
return _consensus_max_length
}