Force sequence reading to produce lowercase sequences.

Adds two columns to the obiclean ratio csv file
2025-06-29 16:20:46 +00:00 · 2022-11-22 15:06:09 +01:00
parent f4daa7f97f
commit 20b16c0ba1
14 changed files with 294 additions and 23 deletions
--- a/cmd/obitools/obitag/main.go
+++ b/cmd/obitools/obitag/main.go
@ -1,6 +1,7 @@
 package main

 import (
+	"fmt"
 	"log"
 	"os"
 	"runtime/pprof"
@ -37,4 +38,6 @@ func main() {
 	identified := obitag.AssignTaxonomy(fs)

 	obiconvert.WriteBioSequences(identified, true)
+
+	fmt.Println("")
 }
--- a/pkg/goutils/goutils.go
+++ b/pkg/goutils/goutils.go
@ -66,7 +66,56 @@ func InterfaceToInt(i interface{}) (val int, err error) {
 	case uint64:
 		val = int(t) // standardizes across systems
 	default:
-		err = &NotABoolean{"value attribute cannot be casted to an integer"}
+		err = &NotAnInteger{"value attribute cannot be casted to an integer"}
+	}
+	return
+}
+
+// NotAnFloat64 defines a new type of Error : "NotAnFloat64"
+type NotAnFloat64 struct {
+	message string
+}
+
+// Error() retrieves the error message associated to the "NotAnFloat64"
+// error. The addition of that Error method makes "NotAnFloat64"
+// comply with the error interface
+func (m *NotAnFloat64) Error() string {
+	return m.message
+}
+
+// InterfaceToFloat64 converts an interface{} to a float64 value if possible.
+// The accepted dynamic types are int, int8, int16, int32, int64, float32,
+// float64, uint8, uint16, uint32 and uint64.
+// For any other type a "NotAnFloat64" error is returned via the err
+// return value and val is set to 0.
+// NOTE(review): plain uint is not among the listed cases, so it ends up in
+// the default branch - confirm whether this is intended.
+func InterfaceToFloat64(i interface{}) (val float64, err error) {
+
+	err = nil
+	val = 0
+
+	switch t := i.(type) {
+	case int:
+		val = float64(t)
+	case int8:
+		val = float64(t) // standardizes across systems
+	case int16:
+		val = float64(t) // standardizes across systems
+	case int32:
+		val = float64(t) // standardizes across systems
+	case int64:
+		val = float64(t) // standardizes across systems
+	case float32:
+		val = float64(t) // standardizes across systems
+	case float64:
+		val = t // already a float64, stored unchanged
+	case uint8:
+		val = float64(t) // standardizes across systems
+	case uint16:
+		val = float64(t) // standardizes across systems
+	case uint32:
+		val = float64(t) // standardizes across systems
+	case uint64:
+		val = float64(t) // standardizes across systems
+	default:
+		err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
 	}
 	return
 }
@ -109,6 +158,45 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
 	return
 }

+// NotAMapFloat64 defines a new type of Error : "NotAMapFloat64"
+type NotAMapFloat64 struct {
+	message string
+}
+
+// Error() retrieves the error message associated to the "NotAMapFloat64"
+// error. The addition of that Error method makes "NotAMapFloat64"
+// comply with the error interface
+func (m *NotAMapFloat64) Error() string {
+	return m.message
+}
+
+// InterfaceToFloat64Map converts an interface{} to a map[string]float64
+// if possible.
+// A map[string]float64 is returned as is, without being copied.
+// A map[string]interface{} is converted value by value using
+// InterfaceToFloat64; the first value that cannot be converted stops the
+// conversion and its error is returned together with the partially
+// filled map.
+// A map[string]int is copied with every value converted to float64.
+// For any other type a "NotAMapFloat64" error is returned via the err
+// return value and val is left nil.
+func InterfaceToFloat64Map(i interface{}) (val map[string]float64, err error) {
+	err = nil
+
+	switch i := i.(type) {
+	case map[string]float64:
+		val = i
+	case map[string]interface{}:
+		val = make(map[string]float64, len(i))
+		for k, v := range i {
+			val[k], err = InterfaceToFloat64(v)
+			if err != nil {
+				return
+			}
+		}
+	case map[string]int:
+		val = make(map[string]float64, len(i))
+		for k, v := range i {
+			val[k] = float64(v)
+		}
+	default:
+		err = &NotAMapFloat64{"value attribute cannot be casted to a map[string]float64"}
+	}
+
+	return
+}
+
+
 // NotABoolean defines a new type of Error : "NotABoolean"
 type NotABoolean struct {
 	message string
--- a/pkg/obialign/fastlcs.go
+++ b/pkg/obialign/fastlcs.go
@ -199,7 +199,6 @@ func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64
 				Sleft = _notavail
 			default:
 				Sdiag = previous[x]
-
 				if bA[j-1] == bB[i-1] {
 					Sdiag = _incscore(Sdiag)
 				}
--- a/pkg/obieval/language.go
+++ b/pkg/obieval/language.go
@ -5,6 +5,131 @@ import (
 	"github.com/PaesslerAG/gval"
 )

+// maxIntVector returns the largest element of values, converted to float64.
+// NOTE(review): values[0] is read unconditionally, so an empty slice
+// makes the function panic.
+func maxIntVector(values []int) float64 {
+	m := values[0]
+	for _,v := range values {
+		if v > m {
+			m = v
+		}
+	}
+
+	return float64(m)
+}
+
+// maxIntMap returns the largest value stored in values, converted to
+// float64. The keys are ignored. For an empty map the loop never runs and
+// the result is 0, the zero value of m.
+func maxIntMap(values  map[string]int) float64 {
+	var m int
+	first := true
+	for _,v := range values {
+		if first {
+			// the first value met initializes the running maximum
+			first = false
+			m = v
+		} else {
+			if v > m {
+				m = v
+			}	
+		}
+	}
+
+	return float64(m)
+}
+
+// minIntVector returns the smallest element of values, converted to float64.
+// NOTE(review): values[0] is read unconditionally, so an empty slice
+// makes the function panic.
+func minIntVector(values []int) float64 {
+	m := values[0]
+	for _,v := range values {
+		if v < m {
+			m = v
+		}
+	}
+
+	return float64(m)
+}
+
+// minIntMap returns the smallest value stored in values, converted to
+// float64. The keys are ignored. For an empty map the loop never runs and
+// the result is 0, the zero value of m.
+func minIntMap(values  map[string]int) float64 {
+	var m int
+	first := true
+	for _,v := range values {
+		if first {
+			// the first value met initializes the running minimum
+			first = false
+			m = v
+		} else {
+			if v < m {
+				m = v
+			}	
+		}
+	}
+
+	return float64(m)
+}
+
+
+// maxFloatVector returns the largest element of values.
+// NOTE(review): values[0] is read unconditionally, so an empty slice
+// makes the function panic.
+func maxFloatVector(values []float64) float64 {
+	m := values[0]
+	for _,v := range values {
+		if v > m {
+			m = v
+		}
+	}
+
+	return m
+}
+
+// maxFloatMap returns the largest value stored in values. The keys are
+// ignored. For an empty map the loop never runs and the result is 0,
+// the zero value of m.
+func maxFloatMap(values  map[string]float64) float64 {
+	var m float64
+	first := true
+	for _,v := range values {
+		if first {
+			// the first value met initializes the running maximum
+			first = false
+			m = v
+		} else {
+			if v > m {
+				m = v
+			}	
+		}
+	}
+
+	return m
+}
+
+// minFloatVector returns the smallest element of values.
+// NOTE(review): values[0] is read unconditionally, so an empty slice
+// makes the function panic.
+func minFloatVector(values []float64) float64 {
+	m := values[0]
+	for _,v := range values {
+		if v < m {
+			m = v
+		}
+	}
+
+	return m
+}
+
+// minFloatMap returns the smallest value stored in values. The keys are
+// ignored. For an empty map the loop never runs and the result is 0,
+// the zero value of m.
+func minFloatMap(values  map[string]float64) float64 {
+	var m float64
+	first := true
+	for _,v := range values {
+		if first {
+			// the first value met initializes the running minimum
+			first = false
+			m = v
+		} else {
+			if v < m {
+				m = v
+			}	
+		}
+	}
+
+	return m
+}
+
+// func maxNumeric(args ...interface{}) (interface{}, error) {
+// 	var m float64 
+//     first := true
+
+// 	for _, v := range args {
+// 		switch {
+// 			case 
+// 		}
+// 	}
+
+// }
+
 var OBILang = gval.NewLanguage(
 	gval.Full(),
 	gval.Function("len", func(args ...interface{}) (interface{}, error) {
--- a/pkg/obiformats/ecopcr_read.go
+++ b/pkg/obiformats/ecopcr_read.go
@ -1,6 +1,7 @@
 package obiformats

 import (
+	"bytes"
 	"compress/gzip"
 	"encoding/csv"
 	"fmt"
@ -67,7 +68,7 @@ func __read_ecopcr_bioseq__(file *__ecopcr_file__) (*obiseq.BioSequence, error)
 		comment = strings.TrimSpace(record[19])
 	}

-	bseq := obiseq.NewBioSequence(name, sequence, comment)
+	bseq := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)
 	annotation := bseq.Annotations()

 	annotation["ac"] = name
--- a/pkg/obiformats/embl_read.go
+++ b/pkg/obiformats/embl_read.go
@ -128,7 +128,7 @@ func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {
 				}
 			case line == "//":
 				sequence := obiseq.NewBioSequence(id,
-					seqBytes.Bytes(),
+					bytes.ToLower(seqBytes.Bytes()),
 					defBytes.String())

 				sequence.SetFeatures(featBytes.Bytes())
--- a/pkg/obiformats/fastseq_read.go
+++ b/pkg/obiformats/fastseq_read.go
@ -7,6 +7,7 @@ package obiformats
 import "C"

 import (
+	"bytes"
 	"fmt"
 	"os"
 	"unsafe"
@ -38,7 +39,7 @@ func _FastseqReader(seqfile C.fast_kseq_p,
 			comment = ""
 		}

-		rep := obiseq.NewBioSequence(name, sequence, comment)
+		rep := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment)

 		if s.qual.l > C.ulong(0) {
 			cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l))
--- a/pkg/obiformats/genbank_read.go
+++ b/pkg/obiformats/genbank_read.go
@ -84,7 +84,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) {

 			case line == "//":
 				sequence := obiseq.NewBioSequence(id,
-					seqBytes.Bytes(),
+					bytes.ToLower(seqBytes.Bytes()),
 					defBytes.String())
 				state = inHeader

--- a/pkg/obiseq/biosequence.go
+++ b/pkg/obiseq/biosequence.go
@ -11,6 +11,7 @@
 package obiseq

 import (
+	"bytes"
 	"crypto/md5"
 	"fmt"
 	"strconv"
@ -370,7 +371,7 @@ func (s *BioSequence) SetSequence(sequence []byte) {
 	if s.sequence != nil {
 		RecycleSlice(&s.sequence)
 	}
-	s.sequence = sequence
+	s.sequence = bytes.ToLower(sequence)
 }

 // Setting the qualities of the BioSequence.
--- a/pkg/obiseq/predicate.go
+++ b/pkg/obiseq/predicate.go
@ -209,9 +209,7 @@ func ExpressionPredicat(expression string) SequencePredicate {
 	f := func(sequence *BioSequence) bool {
 		value, err := exp.EvalBool(context.Background(),
 			map[string]interface{}{
-				"annot":     sequence.Annotations(),
-				"count":     sequence.Count(),
-				"seqlength": sequence.Len(),
+				"annotations":     sequence.Annotations(),
 				"sequence":  sequence,
 			},
 		)
--- a/pkg/obitools/obiclean/graph.go
+++ b/pkg/obitools/obiclean/graph.go
@ -18,6 +18,8 @@ import (

 type Ratio struct {
 	Sample string
+	SeqID  string
+	status string
 	From   int
 	To     int
 	CFrom  int
@ -97,12 +99,14 @@ func EmpiricalDistCsv(filename string, data [][]Ratio) {

 	bar := progressbar.NewOptions(len(data), pbopt...)

-	fmt.Fprintln(file, "Sample,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
+	fmt.Fprintln(file, "Sample,Father_id,Father_status,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length")
 	for code, dist := range data {
 		a1, a2 := intToNucPair(code)
 		for _, ratio := range dist {
-			fmt.Fprintf(file, "%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
+			fmt.Fprintf(file, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d\n",
 				ratio.Sample,
+				ratio.SeqID,
+				ratio.status,
 				a1, a2,
 				ratio.From,
 				ratio.To,
@ -463,7 +467,13 @@ func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
 			for _, edge := range seq.Edges {
 				father := (*seqs)[edge.Father]
 				if father.Weight >= minStatRatio && edge.Dist == 1 {
-					ratio[edge.NucPair] = append(ratio[edge.NucPair], Ratio{name, father.Weight, seq.Weight, father.Count, seq.Count, edge.Pos, father.Sequence.Len()})
+					ratio[edge.NucPair] = append(ratio[edge.NucPair],
+						Ratio{name,
+							father.Sequence.Id(), Status(father.Sequence)[name],
+							father.Weight, seq.Weight,
+							father.Count, seq.Count,
+							edge.Pos,
+							father.Sequence.Len()})
 				}
 			}

--- a/pkg/obitools/obiclean/obiclean.go
+++ b/pkg/obitools/obiclean/obiclean.go
@ -19,6 +19,7 @@ type seqPCR struct {
 	SonCount  int
 	AddedSons int
 	Edges     []Edge
+	Cluster   map[int]bool        // used as the set of head sequences associated to that sequence 
 }

 // buildSamples sorts the sequences by samples
@ -183,13 +184,53 @@ func GetMutation(sequence *obiseq.BioSequence) map[string]string {
 	return mutation
 }

+// GetCluster returns the content of the "obiclean_cluster" annotation of
+// sequence as a map[string]string.
+// When the annotation is absent, a new empty map is created, stored in the
+// annotations under "obiclean_cluster" and returned.
+// When it is already a map[string]string, that map itself is returned.
+// When it is a map[string]interface{}, a new map is built with every value
+// formatted by fmt.Sprint.
+// NOTE(review): that converted map is not written back to the annotations,
+// so changes made to it by the caller do not reach the sequence - confirm
+// whether this is intended.
+// NOTE(review): an annotation of any other type matches no case of the
+// type switch and a nil map is returned; writing to it would panic.
+func GetCluster(sequence *obiseq.BioSequence) map[string]string {
+	annotation := sequence.Annotations()
+	icluster, ok := annotation["obiclean_cluster"]
+	var cluster map[string]string
+
+	if ok {
+		switch icluster := icluster.(type) {
+		case map[string]string:
+			cluster = icluster
+		case map[string]interface{}:
+			cluster = make(map[string]string)
+			for k, v := range icluster {
+				cluster[k] = fmt.Sprint(v)
+			}
+		}
+	} else {
+		// no annotation yet: create it so that later writes are kept
+		cluster = make(map[string]string)
+		annotation["obiclean_cluster"] = cluster
+	}
+
+	return cluster
+}
+
+
+// func Cluster(sample map[string]*([]*seqPCR)) {
+// 	for _, graph := range sample {
+// 		for _, s := range *graph {
+// 			cluster := GetCluster(s.Sequence)
+// 			if len(s.Edges) > 0 {
+// 				for _, f := range s.Edges {
+
+// 				}	
+// 			} else {
+// 				cluster
+// 			}
+
+// 		}
+// 	}
+// }
+
 func Mutation(sample map[string]*([]*seqPCR)) {
 	for _, graph := range sample {
 		for _, s := range *graph {
 			for _, f := range s.Edges {
 				id := (*graph)[f.Father].Sequence.Id()
 				GetMutation(s.Sequence)[id] = fmt.Sprintf("(%c)->(%c)@%d",
-					f.From, f.To, f.Pos + 1)
+					f.From, f.To, f.Pos+1)
 			}
 		}
 	}
@ -277,14 +318,6 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
 		}
 	}

-	if IsSaveRatioTable() {
-		all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
-		EmpiricalDistCsv(RatioTableFilename(), all_ratio)
-	}
-
-	if SaveGraphToFiles() {
-		SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
-	}

 	Mutation(samples)

@ -310,6 +343,16 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch {
 		bar.Add(1)
 	}

+	if SaveGraphToFiles() {
+		SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
+	}
+
+	if IsSaveRatioTable() {
+		all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
+		EmpiricalDistCsv(RatioTableFilename(), all_ratio)
+	}
+
+
 	iter := annotateOBIClean(db, samples, SampleAttribute(), "NA")

 	if OnlyHead() {
--- a/pkg/obitools/obimultiplex/demultiplex.go
+++ b/pkg/obitools/obimultiplex/demultiplex.go
@ -56,5 +56,5 @@ func IExtractBarcode(iterator obiiter.IBioSequenceBatch) (obiiter.IBioSequenceBa
 	}
 	log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers())

-	return newIter, nil
+	return newIter.Speed("Demultiplexing"), nil
 }
--- a/pkg/obitools/obitag/obitag.go
+++ b/pkg/obitools/obitag/obitag.go
@ -66,7 +66,9 @@ func FindClosests(sequence *obiseq.BioSequence,

 		// log.Println(sequence.Id(),cw[j], maxe)
 		if runExact || (atMost <= (maxe + 1)) {
+			// if true {
 			lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix)
+			// fmt.Println(j, cw[j], lcs, alilength, alilength-lcs)
 			// lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix)
 			n++
 			if lcs == -1 {