First attempt at obiconsensus... The graph traversal algorithm is still too simple

Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba
This commit is contained in:
2023-03-27 19:51:10 +07:00
parent d5e84ec676
commit a33e471b39
17 changed files with 868 additions and 23 deletions

View File

@@ -0,0 +1,123 @@
package obiconsensus
import (
	"errors"
	"sort"

	log "github.com/sirupsen/logrus"

	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obisuffix"
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
// BuildConsensus builds a consensus sequence from seqs using a de Bruijn
// graph.
//
// The k-mer size is estimated from the reads themselves: for every read a
// suffix array is built on that read alone and the largest value returned by
// CommonSuffix is recorded. The reads are then ranked on that value and the
// read found at rank int(len(seqs)*quorum) gives the k-mer size (its value
// plus one). The rank is clamped to the valid range, so a quorum of 1.0 (or
// more) selects the last rank and a negative quorum selects the first one.
//
// Every read is pushed into the graph, a minimum link weight is derived from
// the cumulative link spectrum, the graph is filtered with that weight and
// LongestConsensus is asked for the resulting sequence.
//
// On success the returned sequence has its count set to len(seqs) and carries
// the attributes "seq_length", "kmer_size", "kmer_min_occur",
// "kmer_max_occur", "filtered_graph_size" and "full_graph_size".
//
// An error is returned when seqs is empty, or when LongestConsensus reports
// one; in both cases the returned sequence is nil.
func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSequence, error) {
	if len(seqs) == 0 {
		return nil, errors.New("cannot build a consensus from an empty set of sequences")
	}

	log.Printf("Number of reads : %d\n", len(seqs))

	// Per-read statistic used to choose a k-mer size.
	longest := make([]int, len(seqs))
	for i := range seqs {
		s := seqs[i : i+1]
		sa := obisuffix.BuildSuffixArray(&s)
		longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
	}

	// NOTE(review): Order presumably returns the permutation that sorts
	// longest in increasing order - confirm against obiutils.
	o := obiutils.Order(sort.IntSlice(longest))

	// Clamp the rank: quorum >= 1 would otherwise index one past the end,
	// and a negative quorum would yield a negative index.
	rank := int(float64(len(seqs)) * quorum)
	if rank < 0 {
		rank = 0
	}
	if rank >= len(seqs) {
		rank = len(seqs) - 1
	}

	kmersize := longest[o[rank]] + 1
	log.Printf("estimated kmer size : %d", kmersize)

	graph := obikmer.MakeDeBruijnGraph(kmersize)
	for _, s := range seqs {
		graph.Push(s)
	}

	log.Printf("Graph size : %d\n", graph.Len())
	total_kmer := graph.Len()

	spectrum := graph.LinkSpectrum()

	// The entry for weight 1 is discarded before accumulating; guard the
	// access so a spectrum with fewer than two entries does not panic.
	if len(spectrum) > 1 {
		spectrum[1] = 0
	}

	// Turn the spectrum into a cumulative one (from index 2 upward) and count
	// how many consecutive weights share each cumulative total.
	cum := make(map[int]int)
	for i := 2; i < len(spectrum); i++ {
		spectrum[i] += spectrum[i-1]
		cum[spectrum[i]]++
	}

	// Select the cumulative total observed most often. Map iteration order is
	// random, so ties are broken on the smallest total to keep the result
	// reproducible from one run to the next.
	best, kmax := 0, 0
	for k, obs := range cum {
		if obs > best || (obs == best && k < kmax) {
			best, kmax = obs, k
		}
	}

	// The threshold is the first weight at which that total is reached.
	threshold := 0
	for i, total := range spectrum {
		if total == kmax {
			threshold = i
			break
		}
	}

	graph.FilterMin(threshold)
	log.Printf("Graph size : %d\n", graph.Len())

	seq, err := graph.LongestConsensus(seqs[0].Source())
	if err != nil {
		// Do not touch seq: it is not meaningful when an error is reported.
		return nil, err
	}

	seq.SetCount(len(seqs))
	seq.SetAttribute("seq_length", seq.Len())
	seq.SetAttribute("kmer_size", kmersize)
	seq.SetAttribute("kmer_min_occur", threshold)
	seq.SetAttribute("kmer_max_occur", graph.MaxLink())
	seq.SetAttribute("filtered_graph_size", graph.Len())
	seq.SetAttribute("full_graph_size", total_kmer)

	return seq, nil
}
// Consensus turns every batch read from iterator into a single consensus
// sequence and returns an iterator over those consensus sequences.
//
// The input iterator is first passed through SortBatches. Each batch is handed
// to BuildConsensus with the given quorum; successful results are grouped by
// ten into output batches, numbered in the order they are emitted. A batch
// for which BuildConsensus fails produces no output sequence, and the error
// is logged instead of being silently discarded.
//
// The work is done in a background goroutine; the returned iterator is closed
// by a second goroutine once that producer has called Done.
func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequence {
	newIter := obiiter.MakeIBioSequence()
	size := 10 // number of consensus sequences per output batch

	newIter.Add(1)

	go func() {
		newIter.WaitAndClose()
	}()

	go func() {
		order := 0
		iterator = iterator.SortBatches()
		buffer := obiseq.MakeBioSequenceSlice()

		for iterator.Next() {
			seqs := iterator.Get()

			consensus, err := BuildConsensus(seqs.Slice(), quorum)
			if err == nil {
				buffer = append(buffer, consensus)
			} else {
				// Keep processing the remaining batches, but leave a trace
				// of the one that could not be resolved.
				log.Warnf("cannot build consensus, batch skipped: %v", err)
			}

			if len(buffer) == size {
				newIter.Push(obiiter.MakeBioSequenceBatch(order, buffer))
				order++
				buffer = obiseq.MakeBioSequenceSlice()
			}

			seqs.Recycle()
		}

		// Flush the last, possibly incomplete, batch.
		if len(buffer) > 0 {
			newIter.Push(obiiter.MakeBioSequenceBatch(order, buffer))
		}

		newIter.Done()
	}()

	return newIter
}

View File

@@ -0,0 +1,13 @@
package obiconsensus
import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// OptionSet declares the command line options of the obiconsensus tool on
// options by calling obiconvert.InputOptionSet followed by
// obiconvert.OutputOptionSet; no option of its own is added here.
func OptionSet(options *getoptions.GetOpt) {
obiconvert.InputOptionSet(options)
obiconvert.OutputOptionSet(options)
}

View File

@@ -30,6 +30,8 @@ var __compressed__ = false
var __output_file_name__ = "-"
var __paired_file_name__ = ""
var __full_file_batch__ = false
func InputOptionSet(options *getoptions.GetOpt) {
// options.IntVar(&__skipped_entries__, "skip", __skipped_entries__,
// options.Description("The N first sequence records of the file are discarded from the analysis and not reported to the output file."))
@@ -201,3 +203,10 @@ func CLIHasPairedFile() bool {
// CLIPairedFileName returns the current value of the package-level variable
// __paired_file_name__.
func CLIPairedFileName() string {
return __paired_file_name__
}
// SetFullFileBatch sets the package-level flag __full_file_batch__ to true.
// The flag is read back through FullFileBatch.
func SetFullFileBatch() {
__full_file_batch__ = true
}
// FullFileBatch returns the current value of the package-level flag
// __full_file_batch__, which is set to true by SetFullFileBatch.
func FullFileBatch() bool {
return __full_file_batch__
}

View File

@@ -99,9 +99,13 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
opts = append(opts, obiformats.OptionsQualityShift(CLIInputQualityShift()))
opts = append(opts, obiformats.OptionsFullFileBatch(FullFileBatch()))
if len(filenames) == 0 {
log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat())
opts = append(opts, obiformats.OptionsSource("stdin"))
switch CLIInputFormat() {
case "ecopcr":
iterator = obiformats.ReadEcoPCR(os.Stdin, opts...)
@@ -121,7 +125,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
switch CLIInputFormat() {
case "ecopcr":
reader = obiformats.ReadEcoPCRBatchFromFile
reader = obiformats.ReadEcoPCRFromFile
case "embl":
reader = obiformats.ReadEMBLFromFile
case "genbank":