mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
Change obiclean algorithm for a better evaluation of ratio
This commit is contained in:
@ -19,6 +19,14 @@ func AnnotatorToSeqWorker(function SeqAnnotator) SeqWorker {
|
|||||||
return f
|
return f
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// MakeIWorker applies a SeqWorker function to every sequence.
|
||||||
|
//
|
||||||
|
// Sequences are provided by the iterator and modified sequences are pushed
|
||||||
|
// on the returned IBioSequenceBatch.
|
||||||
|
//
|
||||||
|
// Besides the SeqWorker function, the method accepts two optional integer parameters.
|
||||||
|
// - The first sets the number of workers running in parallel (default 4).
|
||||||
|
// - The second sets the size of the channel buffer. By default it is the same as the buffer size of the input iterator.
|
||||||
func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequenceBatch {
|
func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IBioSequenceBatch {
|
||||||
nworkers := 4
|
nworkers := 4
|
||||||
buffsize := iterator.BufferSize()
|
buffsize := iterator.BufferSize()
|
||||||
@ -61,6 +69,51 @@ func (iterator IBioSequenceBatch) MakeIWorker(worker SeqWorker, sizes ...int) IB
|
|||||||
return newIter
|
return newIter
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (iterator IBioSequenceBatch) MakeIConditionalWorker(predicate obiseq.SequencePredicate,
|
||||||
|
worker SeqWorker, sizes ...int) IBioSequenceBatch {
|
||||||
|
nworkers := 4
|
||||||
|
buffsize := iterator.BufferSize()
|
||||||
|
|
||||||
|
if len(sizes) > 0 {
|
||||||
|
nworkers = sizes[0]
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(sizes) > 1 {
|
||||||
|
buffsize = sizes[1]
|
||||||
|
}
|
||||||
|
|
||||||
|
newIter := MakeIBioSequenceBatch(buffsize)
|
||||||
|
|
||||||
|
newIter.Add(nworkers)
|
||||||
|
|
||||||
|
go func() {
|
||||||
|
newIter.WaitAndClose()
|
||||||
|
log.Debugln("End of the batch workers")
|
||||||
|
|
||||||
|
}()
|
||||||
|
|
||||||
|
f := func(iterator IBioSequenceBatch) {
|
||||||
|
for iterator.Next() {
|
||||||
|
batch := iterator.Get()
|
||||||
|
for i, seq := range batch.slice {
|
||||||
|
if predicate(batch.slice[i]) {
|
||||||
|
batch.slice[i] = worker(seq)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
newIter.Push(batch)
|
||||||
|
}
|
||||||
|
newIter.Done()
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Debugln("Start of the batch workers")
|
||||||
|
for i := 0; i < nworkers-1; i++ {
|
||||||
|
go f(iterator.Split())
|
||||||
|
}
|
||||||
|
go f(iterator)
|
||||||
|
|
||||||
|
return newIter
|
||||||
|
}
|
||||||
|
|
||||||
func (iterator IBioSequenceBatch) MakeISliceWorker(worker SeqSliceWorker, sizes ...int) IBioSequenceBatch {
|
func (iterator IBioSequenceBatch) MakeISliceWorker(worker SeqSliceWorker, sizes ...int) IBioSequenceBatch {
|
||||||
nworkers := 4
|
nworkers := 4
|
||||||
buffsize := iterator.BufferSize()
|
buffsize := iterator.BufferSize()
|
||||||
|
@ -199,7 +199,7 @@ func (s *BioSequence) Annotations() Annotation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// A method that returns the value of the key in the annotation map.
|
// A method that returns the value of the key in the annotation map.
|
||||||
func (s *BioSequence) Get(key string) (interface{}, bool) {
|
func (s *BioSequence) GetAttribute(key string) (interface{}, bool) {
|
||||||
var val interface{}
|
var val interface{}
|
||||||
ok := s.annotations != nil
|
ok := s.annotations != nil
|
||||||
|
|
||||||
@ -210,12 +210,17 @@ func (s *BioSequence) Get(key string) (interface{}, bool) {
|
|||||||
return val, ok
|
return val, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *BioSequence) SetAttribute(key string, value interface{}) {
|
||||||
|
annot := s.Annotations()
|
||||||
|
annot[key] = value
|
||||||
|
}
|
||||||
|
|
||||||
// A method that returns the value of the key in the annotation map.
|
// A method that returns the value of the key in the annotation map.
|
||||||
func (s *BioSequence) GetInt(key string) (int, bool) {
|
func (s *BioSequence) GetIntAttribute(key string) (int, bool) {
|
||||||
var val int
|
var val int
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
v, ok := s.Get(key)
|
v, ok := s.GetAttribute(key)
|
||||||
|
|
||||||
if ok {
|
if ok {
|
||||||
val, err = goutils.InterfaceToInt(v)
|
val, err = goutils.InterfaceToInt(v)
|
||||||
@ -226,9 +231,9 @@ func (s *BioSequence) GetInt(key string) (int, bool) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// A method that returns the value of the key in the annotation map.
|
// A method that returns the value of the key in the annotation map.
|
||||||
func (s *BioSequence) GetString(key string) (string, bool) {
|
func (s *BioSequence) GetStringAttribute(key string) (string, bool) {
|
||||||
var val string
|
var val string
|
||||||
v, ok := s.Get(key)
|
v, ok := s.GetAttribute(key)
|
||||||
|
|
||||||
if ok {
|
if ok {
|
||||||
val = fmt.Sprint(v)
|
val = fmt.Sprint(v)
|
||||||
@ -242,7 +247,7 @@ func (s *BioSequence) GetBool(key string) (bool, bool) {
|
|||||||
var val bool
|
var val bool
|
||||||
var err error
|
var err error
|
||||||
|
|
||||||
v, ok := s.Get(key)
|
v, ok := s.GetAttribute(key)
|
||||||
|
|
||||||
if ok {
|
if ok {
|
||||||
val, err = goutils.InterfaceToBool(v)
|
val, err = goutils.InterfaceToBool(v)
|
||||||
@ -259,7 +264,7 @@ func (s *BioSequence) MD5() [16]byte {
|
|||||||
|
|
||||||
// Returning the number of times the sequence has been observed.
|
// Returning the number of times the sequence has been observed.
|
||||||
func (s *BioSequence) Count() int {
|
func (s *BioSequence) Count() int {
|
||||||
count, ok := s.GetInt("count")
|
count, ok := s.GetIntAttribute("count")
|
||||||
|
|
||||||
if !ok {
|
if !ok {
|
||||||
count = 1
|
count = 1
|
||||||
@ -270,7 +275,7 @@ func (s *BioSequence) Count() int {
|
|||||||
|
|
||||||
// Returning the taxid of the sequence.
|
// Returning the taxid of the sequence.
|
||||||
func (s *BioSequence) Taxid() int {
|
func (s *BioSequence) Taxid() int {
|
||||||
taxid, ok := s.GetInt("taxid")
|
taxid, ok := s.GetIntAttribute("taxid")
|
||||||
|
|
||||||
if !ok {
|
if !ok {
|
||||||
taxid = 1
|
taxid = 1
|
||||||
@ -330,6 +335,12 @@ func (s *BioSequence) WriteByteQualities(data byte) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clearing the sequence.
|
||||||
|
func (s *BioSequence) ClearQualities() {
|
||||||
|
s.qualities = s.qualities[0:0]
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
// A method that appends a byte slice to the sequence.
|
// A method that appends a byte slice to the sequence.
|
||||||
func (s *BioSequence) Write(data []byte) (int, error) {
|
func (s *BioSequence) Write(data []byte) (int, error) {
|
||||||
s.sequence = append(s.sequence, data...)
|
s.sequence = append(s.sequence, data...)
|
||||||
@ -347,3 +358,8 @@ func (s *BioSequence) WriteByte(data byte) error {
|
|||||||
s.sequence = append(s.sequence, data)
|
s.sequence = append(s.sequence, data)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Clearing the sequence.
|
||||||
|
func (s *BioSequence) Clear() {
|
||||||
|
s.sequence = s.sequence[0:0]
|
||||||
|
}
|
||||||
|
@ -2,6 +2,8 @@ package obitax
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (taxon *TaxNode) Path() (*TaxonSlice, error) {
|
func (taxon *TaxNode) Path() (*TaxonSlice, error) {
|
||||||
@ -22,6 +24,34 @@ func (taxon *TaxNode) Path() (*TaxonSlice, error) {
|
|||||||
return &path, nil
|
return &path, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (taxon *TaxNode) TaxonAtRank(rank string) *TaxNode {
|
||||||
|
for taxon.rank != rank && taxon != taxon.pparent {
|
||||||
|
taxon = taxon.pparent
|
||||||
|
|
||||||
|
if taxon == nil {
|
||||||
|
log.Panicln("Taxonomy must be reindexed")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if taxon == taxon.pparent {
|
||||||
|
taxon = nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return taxon
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxon *TaxNode) Species() *TaxNode {
|
||||||
|
return taxon.TaxonAtRank("species")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxon *TaxNode) Genus() *TaxNode {
|
||||||
|
return taxon.TaxonAtRank("genus")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxon *TaxNode) Family() *TaxNode {
|
||||||
|
return taxon.TaxonAtRank("family")
|
||||||
|
}
|
||||||
|
|
||||||
// Returns a TaxonSet listing the requested taxon and all
|
// Returns a TaxonSet listing the requested taxon and all
|
||||||
// its ancestors in the taxonomy down to the root.
|
// its ancestors in the taxonomy down to the root.
|
||||||
func (taxonomy *Taxonomy) Path(taxid int) (*TaxonSlice, error) {
|
func (taxonomy *Taxonomy) Path(taxid int) (*TaxonSlice, error) {
|
||||||
|
37
pkg/obitax/sequence_methods.go
Normal file
37
pkg/obitax/sequence_methods.go
Normal file
@ -0,0 +1,37 @@
|
|||||||
|
package obitax
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Setting the taxon at a given rank for a given sequence.
|
||||||
|
//
|
||||||
|
// Two attributes are added to the sequence. One named by the rank name stores
|
||||||
|
// the taxid, a second named by the rank name suffixed with '_name' contains the
|
||||||
|
// Scientific name of the genus.
|
||||||
|
// If the taxon at the given rank doesn't exist for the taxonomy annotation
|
||||||
|
// of the sequence, nothing happens.
|
||||||
|
func (taxonomy *Taxonomy) SetTaxonAtRank(sequence *obiseq.BioSequence, rank string) *TaxNode {
|
||||||
|
taxid := sequence.Taxid()
|
||||||
|
taxon, err := taxonomy.Taxon(taxid)
|
||||||
|
taxonAtRank := taxon.TaxonAtRank(rank)
|
||||||
|
|
||||||
|
if err == nil && taxonAtRank != nil {
|
||||||
|
sequence.SetAttribute(rank, taxonAtRank.taxid)
|
||||||
|
sequence.SetAttribute(rank+"_name", taxonAtRank.scientificname)
|
||||||
|
}
|
||||||
|
|
||||||
|
return taxonAtRank
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxonomy *Taxonomy) SetSpecies(sequence *obiseq.BioSequence) *TaxNode {
|
||||||
|
return taxonomy.SetTaxonAtRank(sequence, "species")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxonomy *Taxonomy) SetGenus(sequence *obiseq.BioSequence) *TaxNode {
|
||||||
|
return taxonomy.SetTaxonAtRank(sequence, "genus")
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxonomy *Taxonomy) SetFamily(sequence *obiseq.BioSequence) *TaxNode {
|
||||||
|
return taxonomy.SetTaxonAtRank(sequence, "family")
|
||||||
|
}
|
@ -22,7 +22,10 @@ func (taxonomy *Taxonomy) IsAValidTaxon(withAutoCorrection ...bool) obiseq.Seque
|
|||||||
if err == nil && taxon.taxid != taxid {
|
if err == nil && taxon.taxid != taxid {
|
||||||
if autocorrection {
|
if autocorrection {
|
||||||
sequence.SetTaxid(taxon.taxid)
|
sequence.SetTaxid(taxon.taxid)
|
||||||
log.Printf("Sequence %s : Taxid %d updated with %d", taxid, taxon.taxid)
|
log.Printf("Sequence %s : Taxid %d updated with %d",
|
||||||
|
sequence.Id(),
|
||||||
|
taxid,
|
||||||
|
taxon.taxid)
|
||||||
} else {
|
} else {
|
||||||
if _, ok := deprecatedTaxidsWarning[taxid]; !ok {
|
if _, ok := deprecatedTaxidsWarning[taxid]; !ok {
|
||||||
deprecatedTaxidsWarning[taxid] = true
|
deprecatedTaxidsWarning[taxid] = true
|
||||||
|
56
pkg/obitax/sequence_workers.go
Normal file
56
pkg/obitax/sequence_workers.go
Normal file
@ -0,0 +1,56 @@
|
|||||||
|
package obitax
|
||||||
|
|
||||||
|
import (
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/goutils"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
|
log "github.com/sirupsen/logrus"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (taxonomy *Taxonomy) MakeSetTaxonAtRankWorker(rank string) obiiter.SeqWorker {
|
||||||
|
|
||||||
|
if !goutils.Contains(taxonomy.RankList(), rank) {
|
||||||
|
log.Fatalf("%s is not a valid rank (allowed ranks are %v)",
|
||||||
|
rank,
|
||||||
|
taxonomy.RankList())
|
||||||
|
}
|
||||||
|
|
||||||
|
w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||||
|
taxonomy.SetTaxonAtRank(sequence, rank)
|
||||||
|
return sequence
|
||||||
|
}
|
||||||
|
|
||||||
|
return w
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxonomy *Taxonomy) MakeSetSpeciesWorker() obiiter.SeqWorker {
|
||||||
|
|
||||||
|
w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||||
|
taxonomy.SetSpecies(sequence)
|
||||||
|
return sequence
|
||||||
|
}
|
||||||
|
|
||||||
|
return w
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxonomy *Taxonomy) MakeSetGenusWorker() obiiter.SeqWorker {
|
||||||
|
|
||||||
|
w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||||
|
taxonomy.SetGenus(sequence)
|
||||||
|
return sequence
|
||||||
|
}
|
||||||
|
|
||||||
|
return w
|
||||||
|
}
|
||||||
|
|
||||||
|
func (taxonomy *Taxonomy) MakeSetFamilyWorker() obiiter.SeqWorker {
|
||||||
|
|
||||||
|
w := func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||||
|
taxonomy.SetFamily(sequence)
|
||||||
|
return sequence
|
||||||
|
}
|
||||||
|
|
||||||
|
return w
|
||||||
|
}
|
||||||
|
|
||||||
|
|
@ -17,12 +17,35 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type Ratio struct {
|
type Ratio struct {
|
||||||
|
Sample string
|
||||||
From int
|
From int
|
||||||
To int
|
To int
|
||||||
|
CFrom int
|
||||||
|
CTo int
|
||||||
Pos int
|
Pos int
|
||||||
Length int
|
Length int
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Edge describes a link from a sequence to one of its fathers in the
// obiclean graph of a sample.
type Edge struct {
	// Father is the index of the father in the sequence slice of the sample.
	Father int
	// From and To are the two symbols recorded for the difference between
	// the linked sequences ('-' for both when the edge comes from
	// extendSimilarityGraph).
	From, To byte
	// Pos is the position recorded for the difference (-1 when the edge
	// comes from extendSimilarityGraph).
	Pos int
	// NucPair is the code computed by nucPair(From, To).
	NucPair int
	// Dist is the distance between the two linked sequences.
	Dist int
}
|
||||||
|
|
||||||
|
func makeEdge(father, dist, pos int, from, to byte) Edge {
|
||||||
|
return Edge{
|
||||||
|
Father: father,
|
||||||
|
Dist: dist,
|
||||||
|
Pos: pos,
|
||||||
|
From: from,
|
||||||
|
To: to,
|
||||||
|
NucPair: nucPair(from, to),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func abs(x int) int {
|
func abs(x int) int {
|
||||||
if x < 0 {
|
if x < 0 {
|
||||||
return -x
|
return -x
|
||||||
@ -74,11 +97,19 @@ func EmpiricalDistCsv(filename string, data [][]Ratio) {
|
|||||||
|
|
||||||
bar := progressbar.NewOptions(len(data), pbopt...)
|
bar := progressbar.NewOptions(len(data), pbopt...)
|
||||||
|
|
||||||
fmt.Fprintln(file, "From,To,Count_from,Count_to,Position,length")
|
fmt.Fprintln(file, "Sample,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
|
||||||
for code, dist := range data {
|
for code, dist := range data {
|
||||||
a1, a2 := intToNucPair(code)
|
a1, a2 := intToNucPair(code)
|
||||||
for _, ratio := range dist {
|
for _, ratio := range dist {
|
||||||
fmt.Fprintf(file, "%c,%c,%d,%d,%d,%d\n", a1, a2, ratio.From, ratio.To, ratio.Pos, ratio.Length)
|
fmt.Fprintf(file, "%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
|
||||||
|
ratio.Sample,
|
||||||
|
a1, a2,
|
||||||
|
ratio.From,
|
||||||
|
ratio.To,
|
||||||
|
ratio.CFrom,
|
||||||
|
ratio.CTo,
|
||||||
|
ratio.Pos,
|
||||||
|
ratio.Length)
|
||||||
}
|
}
|
||||||
bar.Add(1)
|
bar.Add(1)
|
||||||
}
|
}
|
||||||
@ -94,11 +125,11 @@ func Gml(seqs *[]*seqPCR, sample string, statThreshold int) string {
|
|||||||
comment "Obiclean graph for sample {{ Name }}"
|
comment "Obiclean graph for sample {{ Name }}"
|
||||||
directed 1
|
directed 1
|
||||||
{{range $index, $data:= .}}
|
{{range $index, $data:= .}}
|
||||||
{{ if or $data.Fathers (gt $data.SonCount 0)}}
|
{{ if or $data.Edges (gt $data.SonCount 0)}}
|
||||||
node [ id {{$index}}
|
node [ id {{$index}}
|
||||||
graphics [
|
graphics [
|
||||||
type "{{ Shape $data.Count }}"
|
type "{{ Shape $data.Count }}"
|
||||||
fill "{{ if and (gt $data.SonCount 0) (not $data.Fathers)}}#0000FF{{ else }}#00FF00{{ end }}"
|
fill "{{ if and (gt $data.SonCount 0) (not $data.Edges)}}#0000FF{{ else }}#00FF00{{ end }}"
|
||||||
h {{ Sqrt $data.Count }}
|
h {{ Sqrt $data.Count }}
|
||||||
w {{ Sqrt $data.Count }}
|
w {{ Sqrt $data.Count }}
|
||||||
]
|
]
|
||||||
@ -108,11 +139,11 @@ func Gml(seqs *[]*seqPCR, sample string, statThreshold int) string {
|
|||||||
{{ end }}
|
{{ end }}
|
||||||
|
|
||||||
{{range $index, $data:= .}}
|
{{range $index, $data:= .}}
|
||||||
{{range $i, $father:= $data.Fathers}}
|
{{range $i, $edge:= $data.Edges}}
|
||||||
edge [ source {{$index}}
|
edge [ source {{$index}}
|
||||||
target {{$father}}
|
target {{$edge.Father}}
|
||||||
color "{{ if gt (index $data.Dist $i) 1 }}#FF0000{{ else }}#00FF00{{ end }}"
|
color "{{ if gt (index $data.Edges $i).Dist 1 }}#FF0000{{ else }}#00FF00{{ end }}"
|
||||||
label "{{(index $data.Dist $i)}}"
|
label "{{(index $data.Edges $i).Dist}}"
|
||||||
]
|
]
|
||||||
{{ end }}
|
{{ end }}
|
||||||
{{ end }}
|
{{ end }}
|
||||||
@ -226,51 +257,83 @@ func intToNucPair(code int) (a, b byte) {
|
|||||||
return decode[c1], decode[c2]
|
return decode[c1], decode[c2]
|
||||||
}
|
}
|
||||||
|
|
||||||
func buildSamplePairs(seqs *[]*seqPCR, minStatRatio int, workers int) ([][]Ratio, int) {
|
func reweightSequences(seqs *[]*seqPCR) {
|
||||||
|
|
||||||
|
for _, node := range *seqs {
|
||||||
|
node.Weight = node.Count
|
||||||
|
}
|
||||||
|
|
||||||
|
//var rfunc func(*seqPCR)
|
||||||
|
|
||||||
|
rfunc := func(node *seqPCR) {
|
||||||
|
node.AddedSons=0
|
||||||
|
nedges := len(node.Edges)
|
||||||
|
if nedges > 0 {
|
||||||
|
swf := 0.0
|
||||||
|
|
||||||
|
for k := 0; k < nedges; k++ {
|
||||||
|
swf += float64((*seqs)[node.Edges[k].Father].Count)
|
||||||
|
}
|
||||||
|
|
||||||
|
for k := 0; k < nedges; k++ {
|
||||||
|
father := (*seqs)[node.Edges[k].Father]
|
||||||
|
father.Weight += int(math.Round(float64(node.Weight) * float64(father.Count) / swf))
|
||||||
|
father.AddedSons++
|
||||||
|
// log.Println(father.AddedSons, father.SonCount)
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, node := range *seqs {
|
||||||
|
if node.SonCount == 0 {
|
||||||
|
rfunc(node)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for done := true; done; {
|
||||||
|
done = false
|
||||||
|
for _, node := range *seqs {
|
||||||
|
if node.SonCount > 0 && node.SonCount == node.AddedSons {
|
||||||
|
// log.Println(node.AddedSons, node.SonCount)
|
||||||
|
rfunc(node)
|
||||||
|
done = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildSamplePairs(seqs *[]*seqPCR, workers int) int {
|
||||||
nseq := len(*seqs)
|
nseq := len(*seqs)
|
||||||
running := sync.WaitGroup{}
|
running := sync.WaitGroup{}
|
||||||
|
|
||||||
linePairs := func(i int) [][]Ratio {
|
linePairs := func(i int) {
|
||||||
|
|
||||||
ratio := make([][]Ratio, 25)
|
|
||||||
son := (*seqs)[i]
|
son := (*seqs)[i]
|
||||||
|
|
||||||
for j := i + 1; j < nseq; j++ {
|
for j := i + 1; j < nseq; j++ {
|
||||||
father := (*seqs)[j]
|
father := (*seqs)[j]
|
||||||
d, pos, a1, a2 := obialign.D1Or0(son.Sequence, father.Sequence)
|
d, pos, a1, a2 := obialign.D1Or0(son.Sequence, father.Sequence)
|
||||||
if d > 0 {
|
if d > 0 {
|
||||||
son.Fathers = append(son.Fathers, j)
|
son.Edges = append(son.Edges, makeEdge(j, d, pos, a1, a2))
|
||||||
son.Dist = append(son.Dist, d)
|
|
||||||
father.SonCount++
|
father.SonCount++
|
||||||
|
|
||||||
if father.Count > minStatRatio {
|
|
||||||
n := nucPair(a1, a2)
|
|
||||||
ratio[n] = append(ratio[n], Ratio{father.Count, son.Count, pos, father.Sequence.Length()})
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return ratio
|
|
||||||
}
|
}
|
||||||
|
|
||||||
lineChan := make(chan int)
|
lineChan := make(chan int)
|
||||||
idxChan := make(chan [][]Ratio)
|
|
||||||
|
|
||||||
ff := func() {
|
ff := func() {
|
||||||
for i := range lineChan {
|
for i := range lineChan {
|
||||||
idxChan <- linePairs(i)
|
linePairs(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
running.Done()
|
running.Done()
|
||||||
}
|
}
|
||||||
|
|
||||||
running.Add(workers)
|
running.Add(workers)
|
||||||
|
|
||||||
go func() {
|
|
||||||
running.Wait()
|
|
||||||
close(idxChan)
|
|
||||||
}()
|
|
||||||
|
|
||||||
for i := 0; i < workers; i++ {
|
for i := 0; i < workers; i++ {
|
||||||
go ff()
|
go ff()
|
||||||
}
|
}
|
||||||
@ -283,15 +346,12 @@ func buildSamplePairs(seqs *[]*seqPCR, minStatRatio int, workers int) ([][]Ratio
|
|||||||
}()
|
}()
|
||||||
|
|
||||||
np := nseq * (nseq - 1) / 2
|
np := nseq * (nseq - 1) / 2
|
||||||
ratio := make([][]Ratio, 25)
|
|
||||||
for data := range idxChan {
|
|
||||||
for i, r := range data {
|
|
||||||
ratio[i] = append(ratio[i], r...)
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
running.Wait()
|
||||||
|
|
||||||
return ratio, np
|
reweightSequences(seqs)
|
||||||
|
|
||||||
|
return np
|
||||||
}
|
}
|
||||||
|
|
||||||
func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int {
|
func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int {
|
||||||
@ -310,8 +370,7 @@ func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int {
|
|||||||
matrix)
|
matrix)
|
||||||
d := (lali - lcs)
|
d := (lali - lcs)
|
||||||
if lcs >= 0 && d <= step && step > 0 {
|
if lcs >= 0 && d <= step && step > 0 {
|
||||||
son.Fathers = append(son.Fathers, j)
|
son.Edges = append(son.Edges, makeEdge(j, d, -1, '-', '-'))
|
||||||
son.Dist = append(son.Dist, d)
|
|
||||||
father.SonCount++
|
father.SonCount++
|
||||||
//a, b := minMax((*seqs)[i].Count, (*seqs)[j].Count)
|
//a, b := minMax((*seqs)[i].Count, (*seqs)[j].Count)
|
||||||
}
|
}
|
||||||
@ -340,7 +399,7 @@ func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int {
|
|||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
for i := 0; i < nseq; i++ {
|
for i := 0; i < nseq; i++ {
|
||||||
if len((*seqs)[i].Fathers) == 0 {
|
if len((*seqs)[i].Edges) == 0 {
|
||||||
lineChan <- i
|
lineChan <- i
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -354,21 +413,18 @@ func extendSimilarityGraph(seqs *[]*seqPCR, step int, workers int) int {
|
|||||||
|
|
||||||
func FilterGraphOnRatio(seqs *[]*seqPCR, ratio float64) {
|
func FilterGraphOnRatio(seqs *[]*seqPCR, ratio float64) {
|
||||||
for _, s1 := range *seqs {
|
for _, s1 := range *seqs {
|
||||||
c1 := float64(s1.Count)
|
c1 := float64(s1.Weight)
|
||||||
f := s1.Fathers
|
e := s1.Edges
|
||||||
d := s1.Dist
|
|
||||||
j := 0
|
j := 0
|
||||||
for i, s2 := range f {
|
for i, s2 := range e {
|
||||||
f[j] = f[i]
|
e[j] = e[i]
|
||||||
d[j] = d[i]
|
if (c1 / float64((*seqs)[s2.Father].Weight)) <= math.Pow(ratio, float64(e[i].Dist)) {
|
||||||
if (c1 / float64((*seqs)[s2].Count)) <= math.Pow(ratio, float64(d[i])) {
|
|
||||||
j++
|
j++
|
||||||
} else {
|
} else {
|
||||||
(*seqs)[s2].SonCount--
|
(*seqs)[s2.Father].SonCount--
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
s1.Fathers = f[0:j]
|
s1.Edges = e[0:j]
|
||||||
s1.Dist = d[0:j]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -384,7 +440,7 @@ func sortSamples(samples map[string]*([]*seqPCR)) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func ObicleanStatus(seq *seqPCR) string {
|
func ObicleanStatus(seq *seqPCR) string {
|
||||||
if len(seq.Fathers) == 0 {
|
if len(seq.Edges) == 0 {
|
||||||
if seq.SonCount > 0 {
|
if seq.SonCount > 0 {
|
||||||
return "h"
|
return "h"
|
||||||
} else {
|
} else {
|
||||||
@ -395,8 +451,28 @@ func ObicleanStatus(seq *seqPCR) string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
|
||||||
|
ratio := make([][]Ratio, 25)
|
||||||
|
|
||||||
|
for name, seqs := range samples {
|
||||||
|
|
||||||
|
for _, seq := range *seqs {
|
||||||
|
for _, edge := range seq.Edges {
|
||||||
|
father := (*seqs)[edge.Father]
|
||||||
|
if father.Weight >= minStatRatio && edge.Dist == 1 {
|
||||||
|
ratio[edge.NucPair] = append(ratio[edge.NucPair], Ratio{name, father.Weight, seq.Weight, father.Count, seq.Count, edge.Pos, father.Sequence.Length()})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ratio
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func BuildSeqGraph(samples map[string]*[]*seqPCR,
|
func BuildSeqGraph(samples map[string]*[]*seqPCR,
|
||||||
maxError, minStatRatio, workers int) [][]Ratio {
|
maxError, workers int) {
|
||||||
|
|
||||||
sortSamples(samples)
|
sortSamples(samples)
|
||||||
|
|
||||||
@ -416,13 +492,8 @@ func BuildSeqGraph(samples map[string]*[]*seqPCR,
|
|||||||
)
|
)
|
||||||
|
|
||||||
bar := progressbar.NewOptions(npairs, pbopt...)
|
bar := progressbar.NewOptions(npairs, pbopt...)
|
||||||
all_ratio := make([][]Ratio, 25)
|
|
||||||
for _, seqs := range samples {
|
for _, seqs := range samples {
|
||||||
ratio, np := buildSamplePairs(seqs, minStatRatio, workers)
|
np := buildSamplePairs(seqs, workers)
|
||||||
|
|
||||||
for i, r := range ratio {
|
|
||||||
all_ratio[i] = append(all_ratio[i], r...)
|
|
||||||
}
|
|
||||||
|
|
||||||
bar.Add(np)
|
bar.Add(np)
|
||||||
}
|
}
|
||||||
@ -444,6 +515,4 @@ func BuildSeqGraph(samples map[string]*[]*seqPCR,
|
|||||||
bar.Add(np)
|
bar.Add(np)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return all_ratio
|
|
||||||
}
|
}
|
||||||
|
@ -13,11 +13,12 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
type seqPCR struct {
|
type seqPCR struct {
|
||||||
Count int // number of reads associated to a sequence in a PCR
|
Count int // number of reads associated to a sequence in a PCR
|
||||||
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
|
Weight int // Number of reads associated to a sequence after clustering
|
||||||
SonCount int
|
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
|
||||||
Fathers []int
|
SonCount int
|
||||||
Dist []int
|
AddedSons int
|
||||||
|
Edges []Edge
|
||||||
}
|
}
|
||||||
|
|
||||||
// buildSamples sorts the sequences by samples
|
// buildSamples sorts the sequences by samples
|
||||||
@ -43,9 +44,10 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
|
|||||||
}
|
}
|
||||||
|
|
||||||
*pcr = append(*pcr, &seqPCR{
|
*pcr = append(*pcr, &seqPCR{
|
||||||
Count: v,
|
Count: v,
|
||||||
Sequence: s,
|
Sequence: s,
|
||||||
SonCount: 0,
|
SonCount: 0,
|
||||||
|
AddedSons: 0,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -181,6 +183,33 @@ func Status(sequence *obiseq.BioSequence) map[string]string {
|
|||||||
return obistatus
|
return obistatus
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func Weight(sequence *obiseq.BioSequence) map[string]int {
|
||||||
|
annotation := sequence.Annotations()
|
||||||
|
iobistatus, ok := annotation["obiclean_weight"]
|
||||||
|
var weight map[string]int
|
||||||
|
var err error
|
||||||
|
|
||||||
|
if ok {
|
||||||
|
switch iobistatus := iobistatus.(type) {
|
||||||
|
case map[string]int:
|
||||||
|
weight = iobistatus
|
||||||
|
case map[string]interface{}:
|
||||||
|
weight = make(map[string]int)
|
||||||
|
for k, v := range iobistatus {
|
||||||
|
weight[k], err = goutils.InterfaceToInt(v)
|
||||||
|
if err != nil {
|
||||||
|
log.Panicf("Weight value %v cannnot be casted to an integer value\n", v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
weight = make(map[string]int)
|
||||||
|
annotation["obiclean_weight"] = weight
|
||||||
|
}
|
||||||
|
|
||||||
|
return weight
|
||||||
|
}
|
||||||
|
|
||||||
func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
||||||
|
|
||||||
db := itertator.Load()
|
db := itertator.Load()
|
||||||
@ -191,9 +220,8 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
|||||||
|
|
||||||
log.Infof("Dataset composed of %d samples\n", len(samples))
|
log.Infof("Dataset composed of %d samples\n", len(samples))
|
||||||
|
|
||||||
all_ratio := BuildSeqGraph(samples,
|
BuildSeqGraph(samples,
|
||||||
DistStepMax(),
|
DistStepMax(),
|
||||||
MinCountToEvalMutationRate(),
|
|
||||||
obioptions.CLIParallelWorkers())
|
obioptions.CLIParallelWorkers())
|
||||||
|
|
||||||
if RatioMax() < 1.0 {
|
if RatioMax() < 1.0 {
|
||||||
@ -215,6 +243,7 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if IsSaveRatioTable() {
|
if IsSaveRatioTable() {
|
||||||
|
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
|
||||||
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
|
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -237,6 +266,9 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
|
|||||||
for _, pcr := range *seqs {
|
for _, pcr := range *seqs {
|
||||||
obistatus := Status(pcr.Sequence)
|
obistatus := Status(pcr.Sequence)
|
||||||
obistatus[name] = ObicleanStatus(pcr)
|
obistatus[name] = ObicleanStatus(pcr)
|
||||||
|
|
||||||
|
obiweight := Weight(pcr.Sequence)
|
||||||
|
obiweight[name] = pcr.Weight
|
||||||
}
|
}
|
||||||
bar.Add(1)
|
bar.Add(1)
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user