Force sequence reading to produce lowercase sequences.

Adds two columns to the obiclean ratio csv file
This commit is contained in:
2022-11-22 15:06:09 +01:00
parent f4daa7f97f
commit 20b16c0ba1
14 changed files with 294 additions and 23 deletions

View File

@ -1,6 +1,7 @@
package main
import (
"fmt"
"log"
"os"
"runtime/pprof"
@ -37,4 +38,6 @@ func main() {
identified := obitag.AssignTaxonomy(fs)
obiconvert.WriteBioSequences(identified, true)
fmt.Println("")
}

View File

@ -66,7 +66,56 @@ func InterfaceToInt(i interface{}) (val int, err error) {
case uint64:
val = int(t) // standardizes across systems
default:
err = &NotABoolean{"value attribute cannot be casted to an integer"}
err = &NotAnInteger{"value attribute cannot be casted to an integer"}
}
return
}
// NotAnFloat64 is the error type returned when a value cannot be
// converted to a float64.
type NotAnFloat64 struct {
	message string
}

// Error returns the message carried by the NotAnFloat64 error. Having
// this method makes *NotAnFloat64 comply with the error interface.
func (m *NotAnFloat64) Error() string {
	return m.message
}

// InterfaceToFloat64 converts an interface{} holding a built-in integer
// or floating point value to a float64.
//
// Supported dynamic types are int, int8, int16, int32, int64, uint,
// uint8, uint16, uint32, uint64, float32 and float64. For any other
// type, val is 0 and err is a *NotAnFloat64; otherwise err is nil.
func InterfaceToFloat64(i interface{}) (val float64, err error) {
	switch t := i.(type) {
	case float64:
		val = t
	case float32:
		val = float64(t)
	case int:
		val = float64(t)
	case int8:
		val = float64(t)
	case int16:
		val = float64(t)
	case int32:
		val = float64(t)
	case int64:
		val = float64(t)
	case uint:
		// Plain uint is accepted like every other unsigned width.
		val = float64(t)
	case uint8:
		val = float64(t)
	case uint16:
		val = float64(t)
	case uint32:
		val = float64(t)
	case uint64:
		val = float64(t)
	default:
		err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
	}
	return
}
@ -109,6 +158,45 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
return
}
// NotAMapFloat64 is the error type reported when a value cannot be
// converted to a map[string]float64.
type NotAMapFloat64 struct {
	message string
}

// Error returns the message stored in the NotAMapFloat64 error, so that
// *NotAMapFloat64 complies with the error interface.
func (e *NotAMapFloat64) Error() string {
	return e.message
}
// InterfaceToFloat64Map converts an interface{} to a map[string]float64.
//
// A map[string]float64 is returned as is (no copy). A map[string]int or
// a map[string]interface{} is copied into a new map, each value being
// converted to float64; for map[string]interface{} the conversion is
// delegated to InterfaceToFloat64 and stops at the first value that
// fails, returning that error together with the map filled so far.
// Any other type yields a nil map and a *NotAMapFloat64 error.
func InterfaceToFloat64Map(i interface{}) (val map[string]float64, err error) {
	switch source := i.(type) {
	case map[string]float64:
		return source, nil

	case map[string]int:
		val = make(map[string]float64, len(source))
		for key, n := range source {
			val[key] = float64(n)
		}
		return val, nil

	case map[string]interface{}:
		val = make(map[string]float64, len(source))
		for key, raw := range source {
			if val[key], err = InterfaceToFloat64(raw); err != nil {
				return val, err
			}
		}
		return val, nil
	}

	return nil, &NotAMapFloat64{"value attribute cannot be casted to a map[string]float64"}
}
// NotABoolean defines a new type of Error : "NotABoolean"
type NotABoolean struct {
message string

View File

@ -199,7 +199,6 @@ func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64
Sleft = _notavail
default:
Sdiag = previous[x]
if bA[j-1] == bB[i-1] {
Sdiag = _incscore(Sdiag)
}

View File

@ -5,6 +5,131 @@ import (
"github.com/PaesslerAG/gval"
)
// maxIntVector returns the largest value of values as a float64.
//
// An empty (or nil) slice yields 0, matching what maxIntMap returns for
// an empty map, instead of panicking on values[0].
func maxIntVector(values []int) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v > m {
			m = v
		}
	}

	return float64(m)
}
// maxIntMap returns the largest value stored in values as a float64.
// Keys are ignored. An empty or nil map yields 0.
func maxIntMap(values map[string]int) float64 {
	best, seen := 0, false

	for _, candidate := range values {
		// The first visited value always seeds the running maximum.
		if !seen || candidate > best {
			best, seen = candidate, true
		}
	}

	return float64(best)
}
// minIntVector returns the smallest value of values as a float64.
//
// An empty (or nil) slice yields 0, matching what minIntMap returns for
// an empty map, instead of panicking on values[0].
func minIntVector(values []int) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v < m {
			m = v
		}
	}

	return float64(m)
}
// minIntMap returns the smallest value stored in values as a float64.
// Keys are ignored. An empty or nil map yields 0.
func minIntMap(values map[string]int) float64 {
	lowest, seen := 0, false

	for _, candidate := range values {
		// The first visited value always seeds the running minimum.
		if !seen || candidate < lowest {
			lowest, seen = candidate, true
		}
	}

	return float64(lowest)
}
// maxFloatVector returns the largest value of values.
//
// An empty (or nil) slice yields 0, matching what maxFloatMap returns
// for an empty map, instead of panicking on values[0].
func maxFloatVector(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v > m {
			m = v
		}
	}

	return m
}
// maxFloatMap returns the largest value stored in values. Keys are
// ignored. An empty or nil map yields 0.
func maxFloatMap(values map[string]float64) float64 {
	best, seen := 0.0, false

	for _, candidate := range values {
		// The first visited value always seeds the running maximum.
		if !seen || candidate > best {
			best, seen = candidate, true
		}
	}

	return best
}
// minFloatVector returns the smallest value of values.
//
// An empty (or nil) slice yields 0, matching what minFloatMap returns
// for an empty map, instead of panicking on values[0].
func minFloatVector(values []float64) float64 {
	if len(values) == 0 {
		return 0
	}

	m := values[0]
	for _, v := range values[1:] {
		if v < m {
			m = v
		}
	}

	return m
}
// minFloatMap returns the smallest value stored in values. Keys are
// ignored. An empty or nil map yields 0.
func minFloatMap(values map[string]float64) float64 {
	lowest, seen := 0.0, false

	for _, candidate := range values {
		// The first visited value always seeds the running minimum.
		if !seen || candidate < lowest {
			lowest, seen = candidate, true
		}
	}

	return lowest
}
// func maxNumeric(args ...interface{}) (interface{}, error) {
// var m float64
// first := true
// for _, v := range args {
// switch {
// case
// }
// }
// }
var OBILang = gval.NewLanguage(
gval.Full(),
gval.Function("len", func(args ...interface{}) (interface{}, error) {

View File

@ -1,6 +1,7 @@
package obiformats
import (
"bytes"
"compress/gzip"
"encoding/csv"
"fmt"
@ -67,7 +68,7 @@ func __read_ecopcr_bioseq__(file *__ecopcr_file__) (*obiseq.BioSequence, error)
comment = strings.TrimSpace(record[19])
}
bseq := obiseq.NewBioSequence(name, sequence, comment)
bseq := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
annotation := bseq.Annotations()
annotation["ac"] = name

View File

@ -128,7 +128,7 @@ func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {
}
case line == "//":
sequence := obiseq.NewBioSequence(id,
seqBytes.Bytes(),
bytes.ToLower(seqBytes.Bytes()),
defBytes.String())
sequence.SetFeatures(featBytes.Bytes())

View File

@ -7,6 +7,7 @@ package obiformats
import "C"
import (
"bytes"
"fmt"
"os"
"unsafe"
@ -38,7 +39,7 @@ func _FastseqReader(seqfile C.fast_kseq_p,
comment = ""
}
rep := obiseq.NewBioSequence(name, sequence, comment)
rep := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
if s.qual.l > C.ulong(0) {
cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l))

View File

@ -84,7 +84,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {
case line == "//":
sequence := obiseq.NewBioSequence(id,
seqBytes.Bytes(),
bytes.ToLower(seqBytes.Bytes()),
defBytes.String())
state = inHeader

View File

@ -11,6 +11,7 @@
package obiseq
import (
"bytes"
"crypto/md5"
"fmt"
"strconv"
@ -370,7 +371,7 @@ func (s *BioSequence) SetSequence(sequence []byte) {
if s.sequence != nil {
RecycleSlice(&s.sequence)
}
s.sequence = sequence
s.sequence = bytes.ToLower(sequence)
}
// Setting the qualities of the BioSequence.

View File

@ -209,9 +209,7 @@ func ExpressionPredicat(expression string) SequencePredicate {
f := func(sequence *BioSequence) bool {
value, err := exp.EvalBool(context.Background(),
map[string]interface{}{
"annot": sequence.Annotations(),
"count": sequence.Count(),
"seqlength": sequence.Len(),
"annotations": sequence.Annotations(),
"sequence": sequence,
},
)

View File

@ -18,6 +18,8 @@ import (
type Ratio struct {
Sample string
SeqID string
status string
From int
To int
CFrom int
@ -97,12 +99,14 @@ func EmpiricalDistCsv(filename string, data [][]Ratio) {
bar := progressbar.NewOptions(len(data), pbopt...)
fmt.Fprintln(file, "Sample,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
fmt.Fprintln(file, "Sample,Father_id,Father_status,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
for code, dist := range data {
a1, a2 := intToNucPair(code)
for _, ratio := range dist {
fmt.Fprintf(file, "%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
fmt.Fprintf(file, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
ratio.Sample,
ratio.SeqID,
ratio.status,
a1, a2,
ratio.From,
ratio.To,
@ -463,7 +467,13 @@ func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
for _, edge := range seq.Edges {
father := (*seqs)[edge.Father]
if father.Weight >= minStatRatio && edge.Dist == 1 {
ratio[edge.NucPair] = append(ratio[edge.NucPair], Ratio{name, father.Weight, seq.Weight, father.Count, seq.Count, edge.Pos, father.Sequence.Len()})
ratio[edge.NucPair] = append(ratio[edge.NucPair],
Ratio{name,
father.Sequence.Id(), Status(father.Sequence)[name],
father.Weight, seq.Weight,
father.Count, seq.Count,
edge.Pos,
father.Sequence.Len()})
}
}

View File

@ -19,6 +19,7 @@ type seqPCR struct {
SonCount int
AddedSons int
Edges []Edge
Cluster map[int]bool // used as the set of head sequences associated to that sequence
}
// buildSamples sorts the sequences by samples
@ -183,13 +184,53 @@ func GetMutation(sequence *obiseq.BioSequence) map[string]string {
return mutation
}
// GetCluster returns the "obiclean_cluster" annotation of sequence as a
// map[string]string that is always non-nil and always the very map
// stored in the annotations, so that entries added by the caller are
// kept with the sequence.
//
//   - A non-nil map[string]string annotation is returned as is.
//   - A map[string]interface{} annotation is converted, each value being
//     formatted with fmt.Sprint, and the converted map replaces the
//     original annotation.
//   - A missing annotation, a nil map, or a value of any other type is
//     replaced by a new empty map.
//
// Previously the converted map was not stored back (writes to it were
// lost) and an annotation of an unexpected type produced a nil map,
// on which any write panics.
func GetCluster(sequence *obiseq.BioSequence) map[string]string {
	annotation := sequence.Annotations()

	var cluster map[string]string

	// Indexing a missing key yields a nil interface, which falls into
	// the default case below.
	switch stored := annotation["obiclean_cluster"].(type) {
	case map[string]string:
		if stored != nil {
			return stored
		}
		cluster = make(map[string]string)
	case map[string]interface{}:
		cluster = make(map[string]string, len(stored))
		for k, v := range stored {
			cluster[k] = fmt.Sprint(v)
		}
	default:
		cluster = make(map[string]string)
	}

	annotation["obiclean_cluster"] = cluster

	return cluster
}
// func Cluster(sample map[string]*([]*seqPCR)) {
// for _, graph := range sample {
// for _, s := range *graph {
// cluster := GetCluster(s.Sequence)
// if len(s.Edges) > 0 {
// for _, f := range s.Edges {
// }
// } else {
// cluster
// }
// }
// }
// }
func Mutation(sample map[string]*([]*seqPCR)) {
for _, graph := range sample {
for _, s := range *graph {
for _, f := range s.Edges {
id := (*graph)[f.Father].Sequence.Id()
GetMutation(s.Sequence)[id] = fmt.Sprintf("(%c)->(%c)@%d",
f.From, f.To, f.Pos + 1)
f.From, f.To, f.Pos+1)
}
}
}
@ -277,14 +318,6 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
}
}
if IsSaveRatioTable() {
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
}
if SaveGraphToFiles() {
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
}
Mutation(samples)
@ -310,6 +343,16 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
bar.Add(1)
}
if SaveGraphToFiles() {
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
}
if IsSaveRatioTable() {
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
}
iter := annotateOBIClean(db, samples, SampleAttribute(), "NA")
if OnlyHead() {

View File

@ -56,5 +56,5 @@ func IExtractBarcode(iterator obiiter.IBioSequenceBatch) (obiiter.IBioSequenceBa
}
log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers())
return newIter, nil
return newIter.Speed("Demultiplexing"), nil
}

View File

@ -66,7 +66,9 @@ func FindClosests(sequence *obiseq.BioSequence,
// log.Println(sequence.Id(),cw[j], maxe)
if runExact || (atMost <= (maxe + 1)) {
// if true {
lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix)
// fmt.Println(j, cw[j], lcs, alilength, alilength-lcs)
// lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix)
n++
if lcs == -1 {