mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-12-08 16:50:27 +00:00
First attempt for obiconsensus... The graph traversing algorithm is too simple
Former-commit-id: 0456e6c7fd55d6d0fcf9856c40386b976b912cba
This commit is contained in:
123
pkg/obitools/obiconsensus/obiconsensus.go
Normal file
123
pkg/obitools/obiconsensus/obiconsensus.go
Normal file
@@ -0,0 +1,123 @@
|
||||
package obiconsensus
|
||||
|
||||
import (
	"errors"
	"sort"

	log "github.com/sirupsen/logrus"

	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obikmer"
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obisuffix"
	"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
)
|
||||
|
||||
func BuildConsensus(seqs obiseq.BioSequenceSlice, quorum float64) (*obiseq.BioSequence, error) {
|
||||
|
||||
log.Printf("Number of reads : %d\n", len(seqs))
|
||||
|
||||
longest := make([]int, len(seqs))
|
||||
|
||||
for i := range seqs {
|
||||
s := seqs[i : i+1]
|
||||
sa := obisuffix.BuildSuffixArray(&s)
|
||||
longest[i] = obiutils.MaxSlice(sa.CommonSuffix())
|
||||
}
|
||||
|
||||
o := obiutils.Order(sort.IntSlice(longest))
|
||||
i := int(float64(len(seqs)) * quorum)
|
||||
|
||||
kmersize := longest[o[i]] + 1
|
||||
log.Printf("estimated kmer size : %d", kmersize)
|
||||
|
||||
graph := obikmer.MakeDeBruijnGraph(kmersize)
|
||||
|
||||
for _, s := range seqs {
|
||||
graph.Push(s)
|
||||
}
|
||||
|
||||
log.Printf("Graph size : %d\n", graph.Len())
|
||||
total_kmer := graph.Len()
|
||||
spectrum := graph.LinkSpectrum()
|
||||
cum := make(map[int]int)
|
||||
|
||||
spectrum[1]=0
|
||||
for i := 2; i < len(spectrum); i++ {
|
||||
spectrum[i] += spectrum[i-1]
|
||||
cum[spectrum[i]]++
|
||||
}
|
||||
|
||||
max := 0
|
||||
kmax := 0
|
||||
for k, obs := range cum {
|
||||
if obs > max {
|
||||
max = obs
|
||||
kmax = k
|
||||
}
|
||||
}
|
||||
|
||||
threshold := 0
|
||||
for i, total := range spectrum {
|
||||
if total == kmax {
|
||||
threshold = i
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
graph.FilterMin(threshold)
|
||||
log.Printf("Graph size : %d\n", graph.Len())
|
||||
|
||||
seq,err := graph.LongestConsensus(seqs[0].Source())
|
||||
|
||||
seq.SetCount(len(seqs))
|
||||
seq.SetAttribute("seq_length",seq.Len())
|
||||
seq.SetAttribute("kmer_size",kmersize)
|
||||
seq.SetAttribute("kmer_min_occur",threshold)
|
||||
seq.SetAttribute("kmer_max_occur",graph.MaxLink())
|
||||
seq.SetAttribute("filtered_graph_size",graph.Len())
|
||||
seq.SetAttribute("full_graph_size",total_kmer)
|
||||
|
||||
return seq,err
|
||||
}
|
||||
|
||||
func Consensus(iterator obiiter.IBioSequence, quorum float64) obiiter.IBioSequence {
|
||||
newIter := obiiter.MakeIBioSequence()
|
||||
size:=10
|
||||
|
||||
newIter.Add(1)
|
||||
|
||||
go func() {
|
||||
newIter.WaitAndClose()
|
||||
}()
|
||||
|
||||
go func() {
|
||||
order := 0
|
||||
iterator = iterator.SortBatches()
|
||||
buffer := obiseq.MakeBioSequenceSlice()
|
||||
|
||||
for iterator.Next() {
|
||||
seqs := iterator.Get()
|
||||
consensus,err := BuildConsensus(seqs.Slice(),quorum)
|
||||
|
||||
if err == nil {
|
||||
buffer = append(buffer, consensus)
|
||||
}
|
||||
|
||||
if len(buffer) == size {
|
||||
newIter.Push(obiiter.MakeBioSequenceBatch(order, buffer))
|
||||
order++
|
||||
buffer = obiseq.MakeBioSequenceSlice()
|
||||
}
|
||||
seqs.Recycle()
|
||||
}
|
||||
|
||||
if len(buffer) > 0 {
|
||||
newIter.Push(obiiter.MakeBioSequenceBatch(order, buffer))
|
||||
}
|
||||
|
||||
newIter.Done()
|
||||
|
||||
}()
|
||||
|
||||
return newIter
|
||||
}
|
||||
13
pkg/obitools/obiconsensus/options.go
Normal file
13
pkg/obitools/obiconsensus/options.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package obiconsensus
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.InputOptionSet(options)
|
||||
obiconvert.OutputOptionSet(options)
|
||||
}
|
||||
|
||||
@@ -30,6 +30,8 @@ var __compressed__ = false
|
||||
var __output_file_name__ = "-"
|
||||
var __paired_file_name__ = ""
|
||||
|
||||
var __full_file_batch__ = false
|
||||
|
||||
func InputOptionSet(options *getoptions.GetOpt) {
|
||||
// options.IntVar(&__skipped_entries__, "skip", __skipped_entries__,
|
||||
// options.Description("The N first sequence records of the file are discarded from the analysis and not reported to the output file."))
|
||||
@@ -201,3 +203,10 @@ func CLIHasPairedFile() bool {
|
||||
func CLIPairedFileName() string {
|
||||
return __paired_file_name__
|
||||
}
|
||||
|
||||
func SetFullFileBatch() {
|
||||
__full_file_batch__ = true
|
||||
}
|
||||
func FullFileBatch() bool {
|
||||
return __full_file_batch__
|
||||
}
|
||||
@@ -99,9 +99,13 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
||||
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
|
||||
|
||||
opts = append(opts, obiformats.OptionsQualityShift(CLIInputQualityShift()))
|
||||
opts = append(opts, obiformats.OptionsFullFileBatch(FullFileBatch()))
|
||||
|
||||
|
||||
if len(filenames) == 0 {
|
||||
log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat())
|
||||
opts = append(opts, obiformats.OptionsSource("stdin"))
|
||||
|
||||
switch CLIInputFormat() {
|
||||
case "ecopcr":
|
||||
iterator = obiformats.ReadEcoPCR(os.Stdin, opts...)
|
||||
@@ -121,7 +125,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
||||
|
||||
switch CLIInputFormat() {
|
||||
case "ecopcr":
|
||||
reader = obiformats.ReadEcoPCRBatchFromFile
|
||||
reader = obiformats.ReadEcoPCRFromFile
|
||||
case "embl":
|
||||
reader = obiformats.ReadEMBLFromFile
|
||||
case "genbank":
|
||||
|
||||
Reference in New Issue
Block a user