Force sequence reading to produce lowercase sequences.

Adds two columns to the obiclean ratio csv file
This commit is contained in:
2022-11-22 15:06:09 +01:00
parent f4daa7f97f
commit 20b16c0ba1
14 changed files with 294 additions and 23 deletions

View File

@ -1,6 +1,7 @@
package main package main
import ( import (
"fmt"
"log" "log"
"os" "os"
"runtime/pprof" "runtime/pprof"
@ -37,4 +38,6 @@ func main() {
identified := obitag.AssignTaxonomy(fs) identified := obitag.AssignTaxonomy(fs)
obiconvert.WriteBioSequences(identified, true) obiconvert.WriteBioSequences(identified, true)
fmt.Println("")
} }

View File

@ -66,7 +66,56 @@ func InterfaceToInt(i interface{}) (val int, err error) {
case uint64: case uint64:
val = int(t) // standardizes across systems val = int(t) // standardizes across systems
default: default:
err = &NotABoolean{"value attribute cannot be casted to an integer"} err = &NotAnInteger{"value attribute cannot be casted to an integer"}
}
return
}
// NotAnFloat64 defines a new type of Error : "NotAnFloat64".
// It is the error reported when a value cannot be converted to a float64.
type NotAnFloat64 struct {
// message is the text returned by Error().
message string
}
// Error retrieves the error message associated with the "NotAnFloat64"
// error. The addition of that Error method makes "NotAnFloat64"
// comply with the error interface.
func (m *NotAnFloat64) Error() string {
return m.message
}
// InterfaceToFloat64 converts an interface{} to a float64 value if possible.
// If not, a "NotAnFloat64" error is returned via the err
// return value and val is set to 0.
func InterfaceToFloat64(i interface{}) (val float64, err error) {
err = nil
val = 0
switch t := i.(type) {
case int:
val = float64(t)
case int8:
val = float64(t) // standardizes across systems
case int16:
val = float64(t) // standardizes across systems
case int32:
val = float64(t) // standardizes across systems
case int64:
val = float64(t) // standardizes across systems
case float32:
val = float64(t) // standardizes across systems
case float64:
val = t // standardizes across systems
case uint8:
val = float64(t) // standardizes across systems
case uint16:
val = float64(t) // standardizes across systems
case uint32:
val = float64(t) // standardizes across systems
case uint64:
val = float64(t) // standardizes across systems
default:
err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
} }
return return
} }
@ -109,6 +158,45 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
return return
} }
// NotAMapFloat64 defines a new type of Error : "NotAMapFloat64".
// It is the error reported when a value cannot be converted to a
// map[string]float64.
type NotAMapFloat64 struct {
// message is the text returned by Error().
message string
}
// Error retrieves the error message associated with the "NotAMapFloat64"
// error. The addition of that Error method makes "NotAMapFloat64"
// comply with the error interface.
func (m *NotAMapFloat64) Error() string {
return m.message
}
// InterfaceToFloat64Map converts an interface{} to a map[string]float64
// if possible.
//
// Accepted inputs are:
//   - map[string]float64, which is returned as is (not copied);
//   - map[string]interface{}, each value being converted with
//     InterfaceToFloat64;
//   - map[string]int, each value being converted to float64.
//
// For any other type a "NotAMapFloat64" error is returned via err.
// Whenever err is not nil, val is nil: a map that failed half-way through
// its conversion is never handed back to the caller.
func InterfaceToFloat64Map(i interface{}) (val map[string]float64, err error) {
	switch m := i.(type) {
	case map[string]float64:
		val = m
	case map[string]interface{}:
		val = make(map[string]float64, len(m))
		for k, v := range m {
			f, convErr := InterfaceToFloat64(v)
			if convErr != nil {
				// Drop the partially filled map: on error the value is meaningless.
				return nil, convErr
			}
			val[k] = f
		}
	case map[string]int:
		val = make(map[string]float64, len(m))
		for k, v := range m {
			val[k] = float64(v)
		}
	default:
		err = &NotAMapFloat64{"value attribute cannot be casted to a map[string]float64"}
	}

	return val, err
}
// NotABoolean defines a new type of Error : "NotABoolean" // NotABoolean defines a new type of Error : "NotABoolean"
type NotABoolean struct { type NotABoolean struct {
message string message string

View File

@ -199,7 +199,6 @@ func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64
Sleft = _notavail Sleft = _notavail
default: default:
Sdiag = previous[x] Sdiag = previous[x]
if bA[j-1] == bB[i-1] { if bA[j-1] == bB[i-1] {
Sdiag = _incscore(Sdiag) Sdiag = _incscore(Sdiag)
} }

View File

@ -5,6 +5,131 @@ import (
"github.com/PaesslerAG/gval" "github.com/PaesslerAG/gval"
) )
// maxIntVector returns the largest element of values, as a float64.
//
// An empty (or nil) slice yields 0, which matches what maxIntMap returns
// for an empty map, instead of panicking on values[0].
func maxIntVector(values []int) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v > m {
			m = v
		}
	}

	return float64(m)
}
// maxIntMap returns the largest value stored in values, as a float64.
// Keys are ignored. An empty (or nil) map yields 0.
func maxIntMap(values map[string]int) float64 {
	best, seen := 0, false
	for _, candidate := range values {
		// The first value visited always becomes the initial maximum.
		if !seen || candidate > best {
			best, seen = candidate, true
		}
	}
	return float64(best)
}
// minIntVector returns the smallest element of values, as a float64.
//
// An empty (or nil) slice yields 0, which matches what minIntMap returns
// for an empty map, instead of panicking on values[0].
func minIntVector(values []int) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v < m {
			m = v
		}
	}

	return float64(m)
}
// minIntMap returns the smallest value stored in values, as a float64.
// Keys are ignored. An empty (or nil) map yields 0.
func minIntMap(values map[string]int) float64 {
	lowest, seen := 0, false
	for _, candidate := range values {
		// The first value visited always becomes the initial minimum.
		if !seen || candidate < lowest {
			lowest, seen = candidate, true
		}
	}
	return float64(lowest)
}
// maxFloatVector returns the largest element of values.
//
// An empty (or nil) slice yields 0, which matches what maxFloatMap returns
// for an empty map, instead of panicking on values[0].
func maxFloatVector(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v > m {
			m = v
		}
	}

	return m
}
// maxFloatMap returns the largest value stored in values.
// Keys are ignored. An empty (or nil) map yields 0.
func maxFloatMap(values map[string]float64) float64 {
	best, seen := 0.0, false
	for _, candidate := range values {
		// The first value visited always becomes the initial maximum.
		if !seen || candidate > best {
			best, seen = candidate, true
		}
	}
	return best
}
// minFloatVector returns the smallest element of values.
//
// An empty (or nil) slice yields 0, which matches what minFloatMap returns
// for an empty map, instead of panicking on values[0].
func minFloatVector(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v < m {
			m = v
		}
	}

	return m
}
// minFloatMap returns the smallest value stored in values.
// Keys are ignored. An empty (or nil) map yields 0.
func minFloatMap(values map[string]float64) float64 {
	lowest, seen := 0.0, false
	for _, candidate := range values {
		// The first value visited always becomes the initial minimum.
		if !seen || candidate < lowest {
			lowest, seen = candidate, true
		}
	}
	return lowest
}
// func maxNumeric(args ...interface{}) (interface{}, error) {
// var m float64
// first := true
// for _, v := range args {
// switch {
// case
// }
// }
// }
var OBILang = gval.NewLanguage( var OBILang = gval.NewLanguage(
gval.Full(), gval.Full(),
gval.Function("len", func(args ...interface{}) (interface{}, error) { gval.Function("len", func(args ...interface{}) (interface{}, error) {

View File

@ -1,6 +1,7 @@
package obiformats package obiformats
import ( import (
"bytes"
"compress/gzip" "compress/gzip"
"encoding/csv" "encoding/csv"
"fmt" "fmt"
@ -67,7 +68,7 @@ func __read_ecopcr_bioseq__(file *__ecopcr_file__) (*obiseq.BioSequence, error)
comment = strings.TrimSpace(record[19]) comment = strings.TrimSpace(record[19])
} }
bseq := obiseq.NewBioSequence(name, sequence, comment) bseq := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
annotation := bseq.Annotations() annotation := bseq.Annotations()
annotation["ac"] = name annotation["ac"] = name

View File

@ -128,7 +128,7 @@ func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {
} }
case line == "//": case line == "//":
sequence := obiseq.NewBioSequence(id, sequence := obiseq.NewBioSequence(id,
seqBytes.Bytes(), bytes.ToLower(seqBytes.Bytes()),
defBytes.String()) defBytes.String())
sequence.SetFeatures(featBytes.Bytes()) sequence.SetFeatures(featBytes.Bytes())

View File

@ -7,6 +7,7 @@ package obiformats
import "C" import "C"
import ( import (
"bytes"
"fmt" "fmt"
"os" "os"
"unsafe" "unsafe"
@ -38,7 +39,7 @@ func _FastseqReader(seqfile C.fast_kseq_p,
comment = "" comment = ""
} }
rep := obiseq.NewBioSequence(name, sequence, comment) rep := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
if s.qual.l > C.ulong(0) { if s.qual.l > C.ulong(0) {
cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l)) cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l))

View File

@ -84,7 +84,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {
case line == "//": case line == "//":
sequence := obiseq.NewBioSequence(id, sequence := obiseq.NewBioSequence(id,
seqBytes.Bytes(), bytes.ToLower(seqBytes.Bytes()),
defBytes.String()) defBytes.String())
state = inHeader state = inHeader

View File

@ -11,6 +11,7 @@
package obiseq package obiseq
import ( import (
"bytes"
"crypto/md5" "crypto/md5"
"fmt" "fmt"
"strconv" "strconv"
@ -370,7 +371,7 @@ func (s *BioSequence) SetSequence(sequence []byte) {
if s.sequence != nil { if s.sequence != nil {
RecycleSlice(&s.sequence) RecycleSlice(&s.sequence)
} }
s.sequence = sequence s.sequence = bytes.ToLower(sequence)
} }
// Setting the qualities of the BioSequence. // Setting the qualities of the BioSequence.

View File

@ -209,9 +209,7 @@ func ExpressionPredicat(expression string) SequencePredicate {
f := func(sequence *BioSequence) bool { f := func(sequence *BioSequence) bool {
value, err := exp.EvalBool(context.Background(), value, err := exp.EvalBool(context.Background(),
map[string]interface{}{ map[string]interface{}{
"annot": sequence.Annotations(), "annotations": sequence.Annotations(),
"count": sequence.Count(),
"seqlength": sequence.Len(),
"sequence": sequence, "sequence": sequence,
}, },
) )

View File

@ -18,6 +18,8 @@ import (
type Ratio struct { type Ratio struct {
Sample string Sample string
SeqID string
status string
From int From int
To int To int
CFrom int CFrom int
@ -97,12 +99,14 @@ func EmpiricalDistCsv(filename string, data [][]Ratio) {
bar := progressbar.NewOptions(len(data), pbopt...) bar := progressbar.NewOptions(len(data), pbopt...)
fmt.Fprintln(file, "Sample,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length") fmt.Fprintln(file, "Sample,Father_id,Father_status,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
for code, dist := range data { for code, dist := range data {
a1, a2 := intToNucPair(code) a1, a2 := intToNucPair(code)
for _, ratio := range dist { for _, ratio := range dist {
fmt.Fprintf(file, "%s,%c,%c,%d,%d,%d,%d,%d,%d\n", fmt.Fprintf(file, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
ratio.Sample, ratio.Sample,
ratio.SeqID,
ratio.status,
a1, a2, a1, a2,
ratio.From, ratio.From,
ratio.To, ratio.To,
@ -463,7 +467,13 @@ func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
for _, edge := range seq.Edges { for _, edge := range seq.Edges {
father := (*seqs)[edge.Father] father := (*seqs)[edge.Father]
if father.Weight >= minStatRatio && edge.Dist == 1 { if father.Weight >= minStatRatio && edge.Dist == 1 {
ratio[edge.NucPair] = append(ratio[edge.NucPair], Ratio{name, father.Weight, seq.Weight, father.Count, seq.Count, edge.Pos, father.Sequence.Len()}) ratio[edge.NucPair] = append(ratio[edge.NucPair],
Ratio{name,
father.Sequence.Id(), Status(father.Sequence)[name],
father.Weight, seq.Weight,
father.Count, seq.Count,
edge.Pos,
father.Sequence.Len()})
} }
} }

View File

@ -19,6 +19,7 @@ type seqPCR struct {
SonCount int SonCount int
AddedSons int AddedSons int
Edges []Edge Edges []Edge
Cluster map[int]bool // used as the set of head sequences associated to that sequence
} }
// buildSamples sorts the sequences by samples // buildSamples sorts the sequences by samples
@ -183,13 +184,53 @@ func GetMutation(sequence *obiseq.BioSequence) map[string]string {
return mutation return mutation
} }
// GetCluster returns the "obiclean_cluster" annotation of sequence as a
// map[string]string.
//
// The returned map is never nil and is always the one stored in the
// sequence annotations, so entries added by the caller are kept:
//   - a map[string]string annotation is returned as is;
//   - a map[string]interface{} annotation is converted, each value being
//     formatted with fmt.Sprint, and the converted map replaces it;
//   - a missing annotation, or one of any other type, is replaced by a
//     new empty map.
func GetCluster(sequence *obiseq.BioSequence) map[string]string {
	annotation := sequence.Annotations()

	var cluster map[string]string
	switch stored := annotation["obiclean_cluster"].(type) {
	case map[string]string:
		cluster = stored
	case map[string]interface{}:
		cluster = make(map[string]string, len(stored))
		for k, v := range stored {
			cluster[k] = fmt.Sprint(v)
		}
	}

	// Writing to a nil map panics: always hand back a usable map.
	if cluster == nil {
		cluster = make(map[string]string)
	}

	// Store the map back so that later writes through the returned value
	// are visible in the annotations.
	annotation["obiclean_cluster"] = cluster

	return cluster
}
// func Cluster(sample map[string]*([]*seqPCR)) {
// for _, graph := range sample {
// for _, s := range *graph {
// cluster := GetCluster(s.Sequence)
// if len(s.Edges) > 0 {
// for _, f := range s.Edges {
// }
// } else {
// cluster
// }
// }
// }
// }
func Mutation(sample map[string]*([]*seqPCR)) { func Mutation(sample map[string]*([]*seqPCR)) {
for _, graph := range sample { for _, graph := range sample {
for _, s := range *graph { for _, s := range *graph {
for _, f := range s.Edges { for _, f := range s.Edges {
id := (*graph)[f.Father].Sequence.Id() id := (*graph)[f.Father].Sequence.Id()
GetMutation(s.Sequence)[id] = fmt.Sprintf("(%c)->(%c)@%d", GetMutation(s.Sequence)[id] = fmt.Sprintf("(%c)->(%c)@%d",
f.From, f.To, f.Pos + 1) f.From, f.To, f.Pos+1)
} }
} }
} }
@ -277,14 +318,6 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
} }
} }
if IsSaveRatioTable() {
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
}
if SaveGraphToFiles() {
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
}
Mutation(samples) Mutation(samples)
@ -310,6 +343,16 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
bar.Add(1) bar.Add(1)
} }
if SaveGraphToFiles() {
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
}
if IsSaveRatioTable() {
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
}
iter := annotateOBIClean(db, samples, SampleAttribute(), "NA") iter := annotateOBIClean(db, samples, SampleAttribute(), "NA")
if OnlyHead() { if OnlyHead() {

View File

@ -56,5 +56,5 @@ func IExtractBarcode(iterator obiiter.IBioSequenceBatch) (obiiter.IBioSequenceBa
} }
log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers()) log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers())
return newIter, nil return newIter.Speed("Demultiplexing"), nil
} }

View File

@ -66,7 +66,9 @@ func FindClosests(sequence *obiseq.BioSequence,
// log.Println(sequence.Id(),cw[j], maxe) // log.Println(sequence.Id(),cw[j], maxe)
if runExact || (atMost <= (maxe + 1)) { if runExact || (atMost <= (maxe + 1)) {
// if true {
lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix) lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix)
// fmt.Println(j, cw[j], lcs, alilength, alilength-lcs)
// lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix) // lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix)
n++ n++
if lcs == -1 { if lcs == -1 {