Force sequence reading to produce lowercase sequences.

Adds two columns to the obiclean ratio csv file
2025-06-29 16:20:46 +00:00 · 2022-11-22 15:06:09 +01:00
parent f4daa7f97f
commit 20b16c0ba1
14 changed files with 294 additions and 23 deletions
--- a/cmd/obitools/obitag/main.go
+++ b/cmd/obitools/obitag/main.go
@ -1,6 +1,7 @@
 package main
 import (
 	"fmt"
 	"log"
 	"os"
 	"runtime/pprof"
@ -37,4 +38,6 @@ func main() {
 	identified := obitag.AssignTaxonomy(fs)
 	obiconvert.WriteBioSequences(identified, true)
 	fmt.Println("")
 }
--- a/pkg/goutils/goutils.go
+++ b/pkg/goutils/goutils.go
@ -66,7 +66,56 @@ func InterfaceToInt(i interface{}) (val int, err error) {
 	case uint64:
 		val = int(t) // standardizes across systems
 	default:
-		err = &NotABoolean{"value attribute cannot be casted to an integer"}
+		err = &NotAnInteger{"value attribute cannot be casted to an integer"}
 	}
 	return
 }
 // NotAnInteger defines a new type of Error : "NotAnInteger"
 type NotAnFloat64 struct {
 	message string
 }
 // Error() retreives the error message associated to the "NotAnInteger"
 // error. Tha addition of that Error message make the "NotAnInteger"
 // complying with the error interface
 func (m *NotAnFloat64) Error() string {
 	return m.message
 }
 // InterfaceToInt converts a interface{} to an integer value if possible.
 // If not a "NotAnInteger" error is returned via the err
 // return value and val is set to 0.
 func InterfaceToFloat64(i interface{}) (val float64, err error) {
 	err = nil
 	val = 0
 	switch t := i.(type) {
 	case int:
 		val = float64(t)
 	case int8:
 		val = float64(t) // standardizes across systems
 	case int16:
 		val = float64(t) // standardizes across systems
 	case int32:
 		val = float64(t) // standardizes across systems
 	case int64:
 		val = float64(t) // standardizes across systems
 	case float32:
 		val = float64(t) // standardizes across systems
 	case float64:
 		val = t // standardizes across systems
 	case uint8:
 		val = float64(t) // standardizes across systems
 	case uint16:
 		val = float64(t) // standardizes across systems
 	case uint32:
 		val = float64(t) // standardizes across systems
 	case uint64:
 		val = float64(t) // standardizes across systems
 	default:
 		err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
 	}
 	return
 }
@ -109,6 +158,45 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
 	return
 }
 // NotABoolean defines a new type of Error : "NotAMapInt"
 type NotAMapFloat64 struct {
 	message string
 }
 // Error() retreives the error message associated to the "NotAnInteger"
 // error. Tha addition of that Error message make the "NotAnInteger"
 // complying with the error interface
 func (m *NotAMapFloat64) Error() string {
 	return m.message
 }
 func InterfaceToFloat64Map(i interface{}) (val map[string]float64, err error) {
 	err = nil
 	switch i := i.(type) {
 	case map[string]float64:
 		val = i
 	case map[string]interface{}:
 		val = make(map[string]float64, len(i))
 		for k, v := range i {
 			val[k], err = InterfaceToFloat64(v)
 			if err != nil {
 				return
 			}
 		}
 	case map[string]int:
 		val = make(map[string]float64, len(i))
 		for k, v := range i {
 			val[k] = float64(v)
 		}
 	default:
 		err = &NotAMapFloat64{"value attribute cannot be casted to a map[string]float64"}
 	}
 	return
 }
 // NotABoolean defines a new type of Error : "NotABoolean"
 type NotABoolean struct {
 	message string
--- a/pkg/obialign/fastlcs.go
+++ b/pkg/obialign/fastlcs.go
@ -199,7 +199,6 @@ func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64
 				Sleft = _notavail
 			default:
 				Sdiag = previous[x]
 				if bA[j-1] == bB[i-1] {
 					Sdiag = _incscore(Sdiag)
 				}
--- a/pkg/obieval/language.go
+++ b/pkg/obieval/language.go
@ -5,6 +5,131 @@ import (
 	"github.com/PaesslerAG/gval"
 )
 func maxIntVector(values []int) float64 {
 	m := values[0]
 	for _,v := range values {
 		if v > m {
 			m = v
 		}
 	}
 	return float64(m)
 }
 func maxIntMap(values  map[string]int) float64 {
 	var m int
 	first := true
 	for _,v := range values {
 		if first {
 			first = false
 			m = v
 		} else {
 			if v > m {
 				m = v
 			}	
 		}
 	}
 	return float64(m)
 }
 func minIntVector(values []int) float64 {
 	m := values[0]
 	for _,v := range values {
 		if v < m {
 			m = v
 		}
 	}
 	return float64(m)
 }
 func minIntMap(values  map[string]int) float64 {
 	var m int
 	first := true
 	for _,v := range values {
 		if first {
 			first = false
 			m = v
 		} else {
 			if v < m {
 				m = v
 			}	
 		}
 	}
 	return float64(m)
 }
 func maxFloatVector(values []float64) float64 {
 	m := values[0]
 	for _,v := range values {
 		if v > m {
 			m = v
 		}
 	}
 	return m
 }
 func maxFloatMap(values  map[string]float64) float64 {
 	var m float64
 	first := true
 	for _,v := range values {
 		if first {
 			first = false
 			m = v
 		} else {
 			if v > m {
 				m = v
 			}	
 		}
 	}
 	return m
 }
 func minFloatVector(values []float64) float64 {
 	m := values[0]
 	for _,v := range values {
 		if v < m {
 			m = v
 		}
 	}
 	return m
 }
 func minFloatMap(values  map[string]float64) float64 {
 	var m float64
 	first := true
 	for _,v := range values {
 		if first {
 			first = false
 			m = v
 		} else {
 			if v < m {
 				m = v
 			}	
 		}
 	}
 	return m
 }
 // func maxNumeric(args ...interface{}) (interface{}, error) {
 // 	var m float64 
 //     first := true
 // 	for _, v := range args {
 // 		switch {
 // 			case 
 // 		}
 // 	}
 // }
 var OBILang = gval.NewLanguage(
 	gval.Full(),
 	gval.Function("len", func(args ...interface{}) (interface{}, error) {
--- a/pkg/obiformats/ecopcr_read.go
+++ b/pkg/obiformats/ecopcr_read.go
@ -1,6 +1,7 @@
 package obiformats
 import (
 	"bytes"
 	"compress/gzip"
 	"encoding/csv"
 	"fmt"
@ -67,7 +68,7 @@ func __read_ecopcr_bioseq__(file *__ecopcr_file__) (*obiseq.BioSequence, error)
 		comment = strings.TrimSpace(record[19])
 	}
-	bseq := obiseq.NewBioSequence(name, sequence, comment)
+	bseq := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
 	annotation := bseq.Annotations()
 	annotation["ac"] = name
--- a/pkg/obiformats/embl_read.go
+++ b/pkg/obiformats/embl_read.go
@ -128,7 +128,7 @@ func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {
 				}
 			case line == "//":
 				sequence := obiseq.NewBioSequence(id,
-					seqBytes.Bytes(),
+					bytes.ToLower(seqBytes.Bytes()),
 					defBytes.String())
 				sequence.SetFeatures(featBytes.Bytes())
--- a/pkg/obiformats/fastseq_read.go
+++ b/pkg/obiformats/fastseq_read.go
@ -7,6 +7,7 @@ package obiformats
 import "C"
 import (
 	"bytes"
 	"fmt"
 	"os"
 	"unsafe"
@ -38,7 +39,7 @@ func _FastseqReader(seqfile C.fast_kseq_p,
 			comment = ""
 		}
-		rep := obiseq.NewBioSequence(name, sequence, comment)
+		rep := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
 		if s.qual.l > C.ulong(0) {
 			cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l))
--- a/pkg/obiformats/genbank_read.go
+++ b/pkg/obiformats/genbank_read.go
@ -84,7 +84,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {
 			case line == "//":
 				sequence := obiseq.NewBioSequence(id,
-					seqBytes.Bytes(),
+					bytes.ToLower(seqBytes.Bytes()),
 					defBytes.String())
 				state = inHeader
--- a/pkg/obiseq/biosequence.go
+++ b/pkg/obiseq/biosequence.go
@ -11,6 +11,7 @@
 package obiseq
 import (
 	"bytes"
 	"crypto/md5"
 	"fmt"
 	"strconv"
@ -370,7 +371,7 @@ func (s *BioSequence) SetSequence(sequence []byte) {
 	if s.sequence != nil {
 		RecycleSlice(&s.sequence)
 	}
-	s.sequence = sequence
+	s.sequence = bytes.ToLower(sequence)
 }
 // Setting the qualities of the BioSequence.
--- a/pkg/obiseq/predicate.go
+++ b/pkg/obiseq/predicate.go
@ -209,9 +209,7 @@ func ExpressionPredicat(expression string) SequencePredicate {
 	f := func(sequence *BioSequence) bool {
 		value, err := exp.EvalBool(context.Background(),
 			map[string]interface{}{
-				"annot":     sequence.Annotations(),
+				"annotations":     sequence.Annotations(),
 				"count":     sequence.Count(),
 				"seqlength": sequence.Len(),
 				"sequence":  sequence,
 			},
 		)
--- a/pkg/obitools/obiclean/graph.go
+++ b/pkg/obitools/obiclean/graph.go
@ -18,6 +18,8 @@ import (
 type Ratio struct {
 	Sample string
 	SeqID  string
 	status string
 	From   int
 	To     int
 	CFrom  int
@ -97,12 +99,14 @@ func EmpiricalDistCsv(filename string, data [][]Ratio) {
 	bar := progressbar.NewOptions(len(data), pbopt...)
-	fmt.Fprintln(file, "Sample,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
+	fmt.Fprintln(file, "Sample,Father_id,Father_status,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
 	for code, dist := range data {
 		a1, a2 := intToNucPair(code)
 		for _, ratio := range dist {
-			fmt.Fprintf(file, "%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
+			fmt.Fprintf(file, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
 				ratio.Sample,
 				ratio.SeqID,
 				ratio.status,
 				a1, a2,
 				ratio.From,
 				ratio.To,
@ -463,7 +467,13 @@ func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
 			for _, edge := range seq.Edges {
 				father := (*seqs)[edge.Father]
 				if father.Weight >= minStatRatio && edge.Dist == 1 {
-					ratio[edge.NucPair] = append(ratio[edge.NucPair], Ratio{name, father.Weight, seq.Weight, father.Count, seq.Count, edge.Pos, father.Sequence.Len()})
+					ratio[edge.NucPair] = append(ratio[edge.NucPair],
 						Ratio{name,
 							father.Sequence.Id(), Status(father.Sequence)[name],
 							father.Weight, seq.Weight,
 							father.Count, seq.Count,
 							edge.Pos,
 							father.Sequence.Len()})
 				}
 			}
--- a/pkg/obitools/obiclean/obiclean.go
+++ b/pkg/obitools/obiclean/obiclean.go
@ -19,6 +19,7 @@ type seqPCR struct {
 	SonCount  int
 	AddedSons int
 	Edges     []Edge
 	Cluster   map[int]bool        // used as the set of head sequences associated to that sequence 
 }
 // buildSamples sorts the sequences by samples
@ -183,13 +184,53 @@ func GetMutation(sequence *obiseq.BioSequence) map[string]string {
 	return mutation
 }
 func GetCluster(sequence *obiseq.BioSequence) map[string]string {
 	annotation := sequence.Annotations()
 	icluster, ok := annotation["obiclean_cluster"]
 	var cluster map[string]string
 	if ok {
 		switch icluster := icluster.(type) {
 		case map[string]string:
 			cluster = icluster
 		case map[string]interface{}:
 			cluster = make(map[string]string)
 			for k, v := range icluster {
 				cluster[k] = fmt.Sprint(v)
 			}
 		}
 	} else {
 		cluster = make(map[string]string)
 		annotation["obiclean_cluster"] = cluster
 	}
 	return cluster
 }
 // func Cluster(sample map[string]*([]*seqPCR)) {
 // 	for _, graph := range sample {
 // 		for _, s := range *graph {
 // 			cluster := GetCluster(s.Sequence)
 // 			if len(s.Edges) > 0 {
 // 				for _, f := range s.Edges {
 // 				}	
 // 			} else {
 // 				cluster
 // 			}
 // 		}
 // 	}
 // }
 func Mutation(sample map[string]*([]*seqPCR)) {
 	for _, graph := range sample {
 		for _, s := range *graph {
 			for _, f := range s.Edges {
 				id := (*graph)[f.Father].Sequence.Id()
 				GetMutation(s.Sequence)[id] = fmt.Sprintf("(%c)->(%c)@%d",
-					f.From, f.To, f.Pos + 1)
+					f.From, f.To, f.Pos+1)
 			}
 		}
 	}
@ -277,14 +318,6 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
 		}
 	}
 	if IsSaveRatioTable() {
 		all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
 		EmpiricalDistCsv(RatioTableFilename(), all_ratio)
 	}
 	if SaveGraphToFiles() {
 		SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
 	}
 	Mutation(samples)
@ -310,6 +343,16 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
 		bar.Add(1)
 	}
 	if SaveGraphToFiles() {
 		SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
 	}
 	if IsSaveRatioTable() {
 		all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
 		EmpiricalDistCsv(RatioTableFilename(), all_ratio)
 	}
 	iter := annotateOBIClean(db, samples, SampleAttribute(), "NA")
 	if OnlyHead() {
--- a/pkg/obitools/obimultiplex/demultiplex.go
+++ b/pkg/obitools/obimultiplex/demultiplex.go
@ -56,5 +56,5 @@ func IExtractBarcode(iterator obiiter.IBioSequenceBatch) (obiiter.IBioSequenceBa
 	}
 	log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers())
-	return newIter, nil
+	return newIter.Speed("Demultiplexing"), nil
 }
--- a/pkg/obitools/obitag/obitag.go
+++ b/pkg/obitools/obitag/obitag.go
@ -66,7 +66,9 @@ func FindClosests(sequence *obiseq.BioSequence,
 		// log.Println(sequence.Id(),cw[j], maxe)
 		if runExact || (atMost <= (maxe + 1)) {
 			// if true {
 			lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix)
 			// fmt.Println(j, cw[j], lcs, alilength, alilength-lcs)
 			// lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix)
 			n++
 			if lcs == -1 {