diff --git a/cmd/obitools/obitag/main.go b/cmd/obitools/obitag/main.go index 02eec69..aa33b7a 100644 --- a/cmd/obitools/obitag/main.go +++ b/cmd/obitools/obitag/main.go @@ -1,6 +1,7 @@ package main import ( + "fmt" "log" "os" "runtime/pprof" @@ -37,4 +38,6 @@ func main() { identified := obitag.AssignTaxonomy(fs) obiconvert.WriteBioSequences(identified, true) + + fmt.Println("") } diff --git a/pkg/goutils/goutils.go b/pkg/goutils/goutils.go index 58ac343..4015f76 100644 --- a/pkg/goutils/goutils.go +++ b/pkg/goutils/goutils.go @@ -66,7 +66,56 @@ func InterfaceToInt(i interface{}) (val int, err error) { case uint64: val = int(t) // standardizes across systems default: - err = &NotABoolean{"value attribute cannot be casted to an integer"} + err = &NotAnInteger{"value attribute cannot be casted to an integer"} + } + return +} + +// NotAnInteger defines a new type of Error : "NotAnInteger" +type NotAnFloat64 struct { + message string +} + +// Error() retreives the error message associated to the "NotAnInteger" +// error. Tha addition of that Error message make the "NotAnInteger" +// complying with the error interface +func (m *NotAnFloat64) Error() string { + return m.message +} + +// InterfaceToInt converts a interface{} to an integer value if possible. +// If not a "NotAnInteger" error is returned via the err +// return value and val is set to 0. +func InterfaceToFloat64(i interface{}) (val float64, err error) { + + err = nil + val = 0 + + switch t := i.(type) { + case int: + val = float64(t) + case int8: + val = float64(t) // standardizes across systems + case int16: + val = float64(t) // standardizes across systems + case int32: + val = float64(t) // standardizes across systems + case int64: + val = float64(t) // standardizes across systems + case float32: + val = float64(t) // standardizes across systems + case float64: + val = t // standardizes across systems + case uint8: + val = float64(t) // standardizes across systems + case uint16: + val = float64(t) // standardizes across systems + case uint32: + val = float64(t) // standardizes across systems + case uint64: + val = float64(t) // standardizes across systems + default: + err = &NotAnFloat64{"value attribute cannot be casted to a float value"} } return } @@ -109,6 +158,45 @@ func InterfaceToIntMap(i interface{}) (val map[string]int, err error) { return } +// NotABoolean defines a new type of Error : "NotAMapInt" +type NotAMapFloat64 struct { + message string +} + +// Error() retreives the error message associated to the "NotAnInteger" +// error. Tha addition of that Error message make the "NotAnInteger" +// complying with the error interface +func (m *NotAMapFloat64) Error() string { + return m.message +} + +func InterfaceToFloat64Map(i interface{}) (val map[string]float64, err error) { + err = nil + + switch i := i.(type) { + case map[string]float64: + val = i + case map[string]interface{}: + val = make(map[string]float64, len(i)) + for k, v := range i { + val[k], err = InterfaceToFloat64(v) + if err != nil { + return + } + } + case map[string]int: + val = make(map[string]float64, len(i)) + for k, v := range i { + val[k] = float64(v) + } + default: + err = &NotAMapFloat64{"value attribute cannot be casted to a map[string]float64"} + } + + return +} + + // NotABoolean defines a new type of Error : "NotABoolean" type NotABoolean struct { message string diff --git a/pkg/obialign/fastlcs.go b/pkg/obialign/fastlcs.go index 6bbad17..50bd27d 100644 --- a/pkg/obialign/fastlcs.go +++ b/pkg/obialign/fastlcs.go @@ -199,7 +199,6 @@ func FastLCSScore(seqA, seqB *obiseq.BioSequence, maxError int, buffer *[]uint64 Sleft = _notavail default: Sdiag = previous[x] - if bA[j-1] == bB[i-1] { Sdiag = _incscore(Sdiag) } diff --git a/pkg/obieval/language.go b/pkg/obieval/language.go index 6494a7a..d7aea9a 100644 --- a/pkg/obieval/language.go +++ b/pkg/obieval/language.go @@ -5,6 +5,131 @@ import ( "github.com/PaesslerAG/gval" ) +func maxIntVector(values []int) float64 { + m := values[0] + for _,v := range values { + if v > m { + m = v + } + } + + return float64(m) +} + +func maxIntMap(values map[string]int) float64 { + var m int + first := true + for _,v := range values { + if first { + first = false + m = v + } else { + if v > m { + m = v + } + } + } + + return float64(m) +} + +func minIntVector(values []int) float64 { + m := values[0] + for _,v := range values { + if v < m { + m = v + } + } + + return float64(m) +} + +func minIntMap(values map[string]int) float64 { + var m int + first := true + for _,v := range values { + if first { + first = false + m = v + } else { + if v < m { + m = v + } + } + } + + return float64(m) +} + + +func maxFloatVector(values []float64) float64 { + m := values[0] + for _,v := range values { + if v > m { + m = v + } + } + + return m +} + +func maxFloatMap(values map[string]float64) float64 { + var m float64 + first := true + for _,v := range values { + if first { + first = false + m = v + } else { + if v > m { + m = v + } + } + } + + return m +} + +func minFloatVector(values []float64) float64 { + m := values[0] + for _,v := range values { + if v < m { + m = v + } + } + + return m +} + +func minFloatMap(values map[string]float64) float64 { + var m float64 + first := true + for _,v := range values { + if first { + first = false + m = v + } else { + if v < m { + m = v + } + } + } + + return m +} + +// func maxNumeric(args ...interface{}) (interface{}, error) { +// var m float64 +// first := true + +// for _, v := range args { +// switch { +// case +// } +// } + +// } + var OBILang = gval.NewLanguage( gval.Full(), gval.Function("len", func(args ...interface{}) (interface{}, error) { diff --git a/pkg/obiformats/ecopcr_read.go b/pkg/obiformats/ecopcr_read.go index a313af0..eef23e6 100644 --- a/pkg/obiformats/ecopcr_read.go +++ b/pkg/obiformats/ecopcr_read.go @@ -1,6 +1,7 @@ package obiformats import ( + "bytes" "compress/gzip" "encoding/csv" "fmt" @@ -67,7 +68,7 @@ func __read_ecopcr_bioseq__(file *__ecopcr_file__) (*obiseq.BioSequence, error) comment = strings.TrimSpace(record[19]) } - bseq := obiseq.NewBioSequence(name, sequence, comment) + bseq := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment) annotation := bseq.Annotations() annotation["ac"] = name diff --git a/pkg/obiformats/embl_read.go b/pkg/obiformats/embl_read.go index a669a4f..3388497 100644 --- a/pkg/obiformats/embl_read.go +++ b/pkg/obiformats/embl_read.go @@ -128,7 +128,7 @@ func _ParseEmblFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) { } case line == "//": sequence := obiseq.NewBioSequence(id, - seqBytes.Bytes(), + bytes.ToLower(seqBytes.Bytes()), defBytes.String()) sequence.SetFeatures(featBytes.Bytes()) diff --git a/pkg/obiformats/fastseq_read.go b/pkg/obiformats/fastseq_read.go index 8bf31fc..65193af 100644 --- a/pkg/obiformats/fastseq_read.go +++ b/pkg/obiformats/fastseq_read.go @@ -7,6 +7,7 @@ package obiformats import "C" import ( + "bytes" "fmt" "os" "unsafe" @@ -38,7 +39,7 @@ func _FastseqReader(seqfile C.fast_kseq_p, comment = "" } - rep := obiseq.NewBioSequence(name, sequence, comment) + rep := obiseq.NewBioSequence(name, bytes.ToLower(sequence), comment) if s.qual.l > C.ulong(0) { cquality := unsafe.Slice(s.qual.s, C.int(s.qual.l)) diff --git a/pkg/obiformats/genbank_read.go b/pkg/obiformats/genbank_read.go index bc4020d..ec395e3 100644 --- a/pkg/obiformats/genbank_read.go +++ b/pkg/obiformats/genbank_read.go @@ -84,7 +84,7 @@ func _ParseGenbankFile(input <-chan _FileChunk, out obiiter.IBioSequenceBatch) { case line == "//": sequence := obiseq.NewBioSequence(id, - seqBytes.Bytes(), + bytes.ToLower(seqBytes.Bytes()), defBytes.String()) state = inHeader diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 756ce60..e75e05e 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -11,6 +11,7 @@ package obiseq import ( + "bytes" "crypto/md5" "fmt" "strconv" @@ -370,7 +371,7 @@ func (s *BioSequence) SetSequence(sequence []byte) { if s.sequence != nil { RecycleSlice(&s.sequence) } - s.sequence = sequence + s.sequence = bytes.ToLower(sequence) } // Setting the qualities of the BioSequence. diff --git a/pkg/obiseq/predicate.go b/pkg/obiseq/predicate.go index 37d2557..fa3b24b 100644 --- a/pkg/obiseq/predicate.go +++ b/pkg/obiseq/predicate.go @@ -209,9 +209,7 @@ func ExpressionPredicat(expression string) SequencePredicate { f := func(sequence *BioSequence) bool { value, err := exp.EvalBool(context.Background(), map[string]interface{}{ - "annot": sequence.Annotations(), - "count": sequence.Count(), - "seqlength": sequence.Len(), + "annotations": sequence.Annotations(), "sequence": sequence, }, ) diff --git a/pkg/obitools/obiclean/graph.go b/pkg/obitools/obiclean/graph.go index ae579e7..cf3ea75 100644 --- a/pkg/obitools/obiclean/graph.go +++ b/pkg/obitools/obiclean/graph.go @@ -18,6 +18,8 @@ import ( type Ratio struct { Sample string + SeqID string + status string From int To int CFrom int @@ -97,12 +99,14 @@ func EmpiricalDistCsv(filename string, data [][]Ratio) { bar := progressbar.NewOptions(len(data), pbopt...) - fmt.Fprintln(file, "Sample,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length") + fmt.Fprintln(file, "Sample,Father_id,Father_status,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length") for code, dist := range data { a1, a2 := intToNucPair(code) for _, ratio := range dist { - fmt.Fprintf(file, "%s,%c,%c,%d,%d,%d,%d,%d,%d\n", + fmt.Fprintf(file, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d\n", ratio.Sample, + ratio.SeqID, + ratio.status, a1, a2, ratio.From, ratio.To, @@ -463,7 +467,13 @@ func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio { for _, edge := range seq.Edges { father := (*seqs)[edge.Father] if father.Weight >= minStatRatio && edge.Dist == 1 { - ratio[edge.NucPair] = append(ratio[edge.NucPair], Ratio{name, father.Weight, seq.Weight, father.Count, seq.Count, edge.Pos, father.Sequence.Len()}) + ratio[edge.NucPair] = append(ratio[edge.NucPair], + Ratio{name, + father.Sequence.Id(), Status(father.Sequence)[name], + father.Weight, seq.Weight, + father.Count, seq.Count, + edge.Pos, + father.Sequence.Len()}) } } diff --git a/pkg/obitools/obiclean/obiclean.go b/pkg/obitools/obiclean/obiclean.go index 63bcc67..ed40a11 100644 --- a/pkg/obitools/obiclean/obiclean.go +++ b/pkg/obitools/obiclean/obiclean.go @@ -19,6 +19,7 @@ type seqPCR struct { SonCount int AddedSons int Edges []Edge + Cluster map[int]bool // used as the set of head sequences associated to that sequence } // buildSamples sorts the sequences by samples @@ -183,13 +184,53 @@ func GetMutation(sequence *obiseq.BioSequence) map[string]string { return mutation } +func GetCluster(sequence *obiseq.BioSequence) map[string]string { + annotation := sequence.Annotations() + icluster, ok := annotation["obiclean_cluster"] + var cluster map[string]string + + if ok { + switch icluster := icluster.(type) { + case map[string]string: + cluster = icluster + case map[string]interface{}: + cluster = make(map[string]string) + for k, v := range icluster { + cluster[k] = fmt.Sprint(v) + } + } + } else { + cluster = make(map[string]string) + annotation["obiclean_cluster"] = cluster + } + + return cluster +} + + +// func Cluster(sample map[string]*([]*seqPCR)) { +// for _, graph := range sample { +// for _, s := range *graph { +// cluster := GetCluster(s.Sequence) +// if len(s.Edges) > 0 { +// for _, f := range s.Edges { + +// } +// } else { +// cluster +// } + +// } +// } +// } + func Mutation(sample map[string]*([]*seqPCR)) { for _, graph := range sample { for _, s := range *graph { for _, f := range s.Edges { id := (*graph)[f.Father].Sequence.Id() GetMutation(s.Sequence)[id] = fmt.Sprintf("(%c)->(%c)@%d", - f.From, f.To, f.Pos + 1) + f.From, f.To, f.Pos+1) } } } @@ -277,14 +318,6 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { } } - if IsSaveRatioTable() { - all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate()) - EmpiricalDistCsv(RatioTableFilename(), all_ratio) - } - - if SaveGraphToFiles() { - SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate()) - } Mutation(samples) @@ -310,6 +343,16 @@ func IOBIClean(itertator obiiter.IBioSequenceBatch) obiiter.IBioSequenceBatch { bar.Add(1) } + if SaveGraphToFiles() { + SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate()) + } + + if IsSaveRatioTable() { + all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate()) + EmpiricalDistCsv(RatioTableFilename(), all_ratio) + } + + iter := annotateOBIClean(db, samples, SampleAttribute(), "NA") if OnlyHead() { diff --git a/pkg/obitools/obimultiplex/demultiplex.go b/pkg/obitools/obimultiplex/demultiplex.go index 1ae0e07..2a05f74 100644 --- a/pkg/obitools/obimultiplex/demultiplex.go +++ b/pkg/obitools/obimultiplex/demultiplex.go @@ -56,5 +56,5 @@ func IExtractBarcode(iterator obiiter.IBioSequenceBatch) (obiiter.IBioSequenceBa } log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers()) - return newIter, nil + return newIter.Speed("Demultiplexing"), nil } diff --git a/pkg/obitools/obitag/obitag.go b/pkg/obitools/obitag/obitag.go index 9111bf1..e861f84 100644 --- a/pkg/obitools/obitag/obitag.go +++ b/pkg/obitools/obitag/obitag.go @@ -66,7 +66,9 @@ func FindClosests(sequence *obiseq.BioSequence, // log.Println(sequence.Id(),cw[j], maxe) if runExact || (atMost <= (maxe + 1)) { + // if true { lcs, alilength := obialign.FastLCSScore(sequence, ref, maxe+1, &matrix) + // fmt.Println(j, cw[j], lcs, alilength, alilength-lcs) // lcs, alilength := obialign.LCSScore(sequence, ref, maxe+1, matrix) n++ if lcs == -1 {