diff --git a/pkg/obialign/fastlcsegf.go b/pkg/obialign/fastlcsegf.go index d59ac46..bce5d06 100644 --- a/pkg/obialign/fastlcsegf.go +++ b/pkg/obialign/fastlcsegf.go @@ -4,33 +4,6 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" ) -var _iupac = [26]byte{ - // a b c d e f - 1, 14, 2, 13, 0, 0, - // g h i j k l - 4, 11, 0, 0, 12, 0, - // m n o p q r - 3, 15, 0, 0, 0, 5, - // s t u v w x - 6, 8, 8, 13, 9, 0, - // y z - 10, 0, -} - -func _samenuc(a, b byte) bool { - if (a >= 'A') && (a <= 'Z') { - a |= 32 - } - if (b >= 'A') && (b <= 'Z') { - b |= 32 - } - - if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') { - return (_iupac[a-'a'] & _iupac[b-'a']) > 0 - } - return a == b -} - // FastLCSEGFScoreByte calculates the score of the Longest Common Subsequence (LCS) between two byte slices. // // The score is calculated using the following scoring matrix: @@ -165,7 +138,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ default: // We are in the middle of the matrix Sdiag = _incpath(previous[x]) - if _samenuc(bA[j-1], bB[i-1]) { + if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) { Sdiag = _incscore(Sdiag) } @@ -265,7 +238,7 @@ func FastLCSEGFScoreByte(bA, bB []byte, maxError int, endgapfree bool, buffer *[ Sleft = _notavail default: Sdiag = _incpath(previous[x]) - if _samenuc(bA[j-1], bB[i-1]) { + if obiseq.SameIUPACNuc(bA[j-1], bB[i-1]) { Sdiag = _incscore(Sdiag) } diff --git a/pkg/obialign/locatepattern.go b/pkg/obialign/locatepattern.go index 23b5538..d5170e6 100644 --- a/pkg/obialign/locatepattern.go +++ b/pkg/obialign/locatepattern.go @@ -1,6 +1,9 @@ package obialign -import log "github.com/sirupsen/logrus" +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + log "github.com/sirupsen/logrus" +) // buffIndex converts a pair of coordinates (i, j) into a linear index in a matrix // of size width x width. The coordinates are (-1)-indexed, and the linear index @@ -69,7 +72,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) { // Mismatch score = -1 // Match score = 0 match := -1 - if _samenuc(pattern[j], sequence[i]) { + if obiseq.SameIUPACNuc(pattern[j], sequence[i]) { match = 0 } @@ -103,7 +106,7 @@ func LocatePattern(id string, pattern, sequence []byte) (int, int, int) { // Mismatch score = -1 // Match score = 0 match := -1 - if _samenuc(pattern[jmax], sequence[i]) { + if obiseq.SameIUPACNuc(pattern[jmax], sequence[i]) { match = 0 } diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index b326b22..76d4cf6 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -8,7 +8,7 @@ import ( // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "6d204f6" +var _Commit = "29bf4ce" var _Version = "Release 4.4.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/iupac_nog.go b/pkg/obiseq/iupac_nog.go new file mode 100644 index 0000000..a3fba9b --- /dev/null +++ b/pkg/obiseq/iupac_nog.go @@ -0,0 +1,28 @@ +package obiseq + +var _iupac = [26]byte{ + // a b c d e f + 1, 14, 2, 13, 0, 0, + // g h i j k l + 4, 11, 0, 0, 12, 0, + // m n o p q r + 3, 15, 0, 0, 0, 5, + // s t u v w x + 6, 8, 8, 13, 9, 0, + // y z + 10, 0, +} + +func SameIUPACNuc(a, b byte) bool { + if (a >= 'A') && (a <= 'Z') { + a |= 32 + } + if (b >= 'A') && (b <= 'Z') { + b |= 32 + } + + if (a >= 'a') && (a <= 'z') && (b >= 'a') && (b <= 'z') { + return (_iupac[a-'a'] & _iupac[b-'a']) > 0 + } + return a == b +} diff --git a/pkg/obitools/obiclean/chimera.go b/pkg/obitools/obiclean/chimera.go index b13637a..fda821e 100644 --- a/pkg/obitools/obiclean/chimera.go +++ b/pkg/obitools/obiclean/chimera.go @@ -2,12 +2,10 @@ package obiclean import ( "fmt" - "os" "sort" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" - "github.com/schollz/progressbar/v3" log "github.com/sirupsen/logrus" ) @@ -21,13 +19,13 @@ func commonPrefix(a, b *obiseq.BioSequence) int { as := a.Sequence() bs := b.Sequence() - for i < l && as[i] == bs[i] { + for i < l && obiseq.SameIUPACNuc(as[i], bs[i]) { i++ } - if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) { - log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i]) - } + // if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) { + // log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i]) + // } return i } @@ -44,29 +42,97 @@ func commonSuffix(a, b *obiseq.BioSequence) int { bs := b.Sequence() l := 0 - for i >= 0 && j >= 0 && as[i] == bs[j] { + for i >= 0 && j >= 0 && obiseq.SameIUPACNuc(as[i], bs[j]) { i-- j-- l++ } - if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) { - log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:]) - } - // obilog.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:]) + // if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) { + // log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:]) + // } + // log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:]) return l } +// oneDifference return true if s1 and s2 differ by exactly 1 operation +// (substitution, insertion or deletion) +func oneDifference(s1, s2 []byte) bool { + l1 := len(s1) + l2 := len(s2) + + // Case 1: same lengths → test substitution + if l1 == l2 { + diff := 0 + for i := 0; i < l1; i++ { + if s1[i] != s2[i] { + diff++ + if diff > 1 { + return false + } + } + } + return diff == 1 + } + + // Case 2: difference of 1 character → insertion/deletion + if l1 == l2+1 { + for i := 0; i < l1; i++ { + if string(s1[:i])+string(s1[i+1:]) == string(s2) { + return true + } + } + return false + } + if l2 == l1+1 { + for i := 0; i < l2; i++ { + if string(s2[:i])+string(s2[i+1:]) == string(s1) { + return true + } + } + return false + } + + // Case 3: difference > 1 character + return false +} + +func GetChimera(sequence *obiseq.BioSequence) map[string]string { + annotation := sequence.Annotations() + iobistatus, ok := annotation["chimera"] + var chimera map[string]string + var err error + + if ok { + switch iobistatus := iobistatus.(type) { + case map[string]string: + chimera = iobistatus + case map[string]interface{}: + chimera = make(map[string]string) + for k, v := range iobistatus { + chimera[k], err = obiutils.InterfaceToString(v) + if err != nil { + log.Panicf("chimera attribute of sequence %s must be castable to a map[string]string", sequence.Id()) + } + } + } + } else { + chimera = make(map[string]string) + annotation["chimera"] = chimera + } + + return chimera +} + +// add the suffix/prefix comparisons to detect chimeras (handle IUPAC codes) func AnnotateChimera(samples map[string]*[]*seqPCR) { w := func(sample string, seqs *[]*seqPCR) { ls := len(*seqs) - cp := make([]int, ls) - cs := make([]int, ls) - pcrs := make([]*seqPCR, 0, ls) + // select only sequences without edges (head sequences) for _, s := range *seqs { if len(s.Edges) == 0 { pcrs = append(pcrs, s) @@ -75,66 +141,63 @@ func AnnotateChimera(samples map[string]*[]*seqPCR) { lp := len(pcrs) + // sort by increasing weight (like increasing abundance) sort.Slice(pcrs, func(i, j int) bool { return pcrs[i].Weight < pcrs[j].Weight }) for i, s := range pcrs { - for j := i + 1; j < lp; j++ { - s2 := pcrs[j] - cp[j] = commonPrefix(s.Sequence, s2.Sequence) - cs[j] = commonSuffix(s.Sequence, s2.Sequence) - } + seqRef := s.Sequence + L := seqRef.Len() - var cm map[string]string - var err error + maxLeft, maxRight := 0, 0 + var nameLeft, nameRight string - chimera, ok := s.Sequence.GetAttribute("chimera") + // looking for potential parents + for j := 0; j < lp; j++ { + if j == i { + continue + } + seqParent := pcrs[j].Sequence - if !ok { - cm = map[string]string{} - } else { - cm, err = obiutils.InterfaceToStringMap(chimera) - if err != nil { - log.Fatalf("type of chimera not map[string]string: %T (%v)", - chimera, err) + // Check abundance: parent must be more abundant + if pcrs[j].Weight <= s.Weight { + continue + } + + // Check edit distance (skip if only one diff, supposed to never happen) + if oneDifference(seqRef.Sequence(), seqParent.Sequence()) { + continue + } + + // Common prefix + left := commonPrefix(seqRef, seqParent) + if left > maxLeft { + maxLeft = left + nameLeft = seqParent.Id() + } + + // Common suffix + right := commonSuffix(seqRef, seqParent) + if right > maxRight { + maxRight = right + nameRight = seqParent.Id() } } - ls := s.Sequence.Len() + // Select parents with longuest prefix/suffix + // Condition prefix+suffix covers the sequence and sequence not include into parent + if maxLeft >= L-maxRight && maxLeft > 0 && maxRight < L { - for k := i + 1; k < lp; k++ { - for l := i + 1; l < lp; l++ { - if k != l && cp[k]+cs[l] == ls { - cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)", - pcrs[k].Sequence.Id(), - pcrs[l].Sequence.Id(), - cp[k]) - } - } - } - - if len(cm) > 0 { - s.Sequence.SetAttribute("chimera", cm) + chimeraMap := GetChimera(s.Sequence) + // overlap sequence + overlap := seqRef.Sequence()[L-maxRight : maxLeft] + chimeraMap[sample] = fmt.Sprintf("{%s}/{%s}@(%s)(%d)(%d)(%d)", nameLeft, nameRight, overlap, L-maxRight, maxLeft, len(overlap)) } } - } - pbopt := make([]progressbar.Option, 0, 5) - pbopt = append(pbopt, - progressbar.OptionSetWriter(os.Stderr), - progressbar.OptionSetWidth(15), - progressbar.OptionShowIts(), - progressbar.OptionSetPredictTime(true), - progressbar.OptionSetDescription("[Chimera detection]"), - ) - - bar := progressbar.NewOptions(len(samples), pbopt...) - for sn, sqs := range samples { w(sn, sqs) - bar.Add(1) } - } diff --git a/pkg/obitools/obimatrix/obimatrix.go b/pkg/obitools/obimatrix/obimatrix.go index a08a31c..0a77159 100644 --- a/pkg/obitools/obimatrix/obimatrix.go +++ b/pkg/obitools/obimatrix/obimatrix.go @@ -3,6 +3,7 @@ package obimatrix import ( "encoding/csv" "os" + "slices" "sort" "sync" @@ -11,41 +12,55 @@ import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "golang.org/x/exp/maps" ) -type MatrixData map[string]map[string]interface{} +type MatrixData struct { + matrix map[string]map[string]interface{} + attributes map[string]map[string]interface{} + attributeList []string + naValue string +} // MakeMatrixData generates a MatrixData instance. // // No parameters. // Returns a MatrixData. -func MakeMatrixData() MatrixData { - return make(MatrixData) +func MakeMatrixData(naValue string, attributes ...string) MatrixData { + return MatrixData{ + matrix: make(map[string]map[string]interface{}), + attributes: make(map[string]map[string]interface{}), + attributeList: slices.Clone(attributes), + naValue: naValue, + } } // NewMatrixData creates a new instance of MatrixData. // // It does not take any parameters. // It returns a pointer to a MatrixData object. -func NewMatrixData() *MatrixData { - m := make(MatrixData) +func NewMatrixData(naValue string, attributes ...string) *MatrixData { + m := MakeMatrixData(naValue, attributes...) return &m } // TransposeMatrixData transposes the MatrixData. // // It takes no parameters. +// If the input matrix has attributes, they are lost. +// A unique attribute "id" is added to store the column ids of the input matrix. // It returns a pointer to the transposed MatrixData. func (matrix *MatrixData) TransposeMatrixData() *MatrixData { - m := make(MatrixData) - for k, v := range *matrix { + m := MakeMatrixData(matrix.naValue, "id") + for k, v := range *&matrix.matrix { for kk, vv := range v { - if _, ok := m[kk]; !ok { - m[kk] = make(map[string]interface{}) + if _, ok := m.matrix[kk]; !ok { + m.matrix[kk] = make(map[string]interface{}) } - m[kk][k] = vv + m.matrix[kk][k] = vv + m.attributes[kk] = map[string]interface{}{"id": k} } } return &m @@ -58,11 +73,12 @@ func (matrix *MatrixData) TransposeMatrixData() *MatrixData { // Returns the pointer to the merged MatrixData. func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData { - for k := range *data2 { - if _, ok := (*data1)[k]; ok { + for k := range data2.matrix { + if _, ok := data1.matrix[k]; ok { log.Panicf("Sequence Id %s exists at least twice in the data set", k) } else { - (*data1)[k] = (*data2)[k] + data1.matrix[k] = data2.matrix[k] + data1.attributes[k] = data2.attributes[k] } } @@ -77,21 +93,74 @@ func (data1 *MatrixData) MergeMatrixData(data2 *MatrixData) *MatrixData { // // Returns: // - *MatrixData: The updated MatrixData object. -func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string) *MatrixData { +func (data *MatrixData) Update(s *obiseq.BioSequence, mapkey string, strict bool) *MatrixData { + + sid := s.Id() + + if _, ok := data.matrix[sid]; ok { + log.Panicf("Sequence Id %s exists at least twice in the data set", sid) + } if v, ok := s.GetAttribute(mapkey); ok { if m, ok := v.(*obiseq.StatsOnValues); ok { m.RLock() - (*data)[s.Id()] = obiutils.MapToMapInterface(m.Map()) + data.matrix[sid] = obiutils.MapToMapInterface(m.Map()) m.RUnlock() } else if obiutils.IsAMap(v) { - (*data)[s.Id()] = obiutils.MapToMapInterface(v) + data.matrix[sid] = obiutils.MapToMapInterface(v) } else { log.Panicf("Attribute %s is not a map in the sequence %s", mapkey, s.Id()) } } else { - log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id()) + if strict { + log.Panicf("Attribute %s does not exist in the sequence %s", mapkey, s.Id()) + } + data.matrix[sid] = make(map[string]interface{}) } + attrs := make(map[string]interface{}, len(data.attributeList)) + for _, attrname := range data.attributeList { + var value interface{} + ok := false + switch attrname { + case "id": + value = s.Id + ok = true + case "count": + value = s.Count() + ok = true + case "taxon": + taxon := s.Taxon(nil) + if taxon != nil { + value = taxon.String() + + } else { + value = s.Taxid() + } + ok = true + case "sequence": + value = s.String() + ok = true + case "quality": + if s.HasQualities() { + l := s.Len() + q := s.Qualities() + ascii := make([]byte, l) + quality_shift := obidefault.WriteQualitiesShift() + for j := 0; j < l; j++ { + ascii[j] = uint8(q[j]) + uint8(quality_shift) + } + value = string(ascii) + ok = true + } + default: + value, ok = s.GetAttribute(attrname) + } + if ok { + attrs[attrname] = value + } + } + data.attributes[sid] = attrs + return data } @@ -101,6 +170,49 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData { waiter := sync.WaitGroup{} mapAttribute := CLIMapAttribute() + attribList := make([]string, 0) + + if obicsv.CLIPrintId() { + attribList = append(attribList, "id") + } + + if obicsv.CLIPrintCount() { + attribList = append(attribList, "count") + } + + if obicsv.CLIPrintTaxon() { + attribList = append(attribList, "taxon") + } + + if obicsv.CLIPrintDefinition() { + attribList = append(attribList, "definition") + } + + if obicsv.CLIPrintSequence() { + attribList = append(attribList, "sequence") + } + + if obicsv.CLIPrintQuality() { + attribList = append(attribList, "qualities") + } + + attribList = append(attribList, obicsv.CLIToBeKeptAttributes()...) + + if obicsv.CLIAutoColumns() { + if iterator.Next() { + batch := iterator.Get() + if len(batch.Slice()) == 0 { + log.Panicf("first batch should not be empty") + } + auto_slot := batch.Slice().AttributeKeys(true, true).Members() + slices.Sort(auto_slot) + attribList = append(attribList, auto_slot...) + iterator.PushBack() + } + } + + naValue := obicsv.CLINAValue() + strict := CLIStrict() summaries := make([]*MatrixData, nproc) @@ -109,7 +221,7 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData { for iseq.Next() { batch := iseq.Get() for _, seq := range batch.Slice() { - summary.Update(seq, mapAttribute) + summary.Update(seq, mapAttribute, strict) } } waiter.Done() @@ -117,11 +229,11 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData { waiter.Add(nproc) - summaries[0] = NewMatrixData() + summaries[0] = NewMatrixData(naValue, attribList...) go ff(iterator, summaries[0]) for i := 1; i < nproc; i++ { - summaries[i] = NewMatrixData() + summaries[i] = NewMatrixData(naValue, attribList...) go ff(iterator.Split(), summaries[i]) } @@ -138,7 +250,7 @@ func IMatrix(iterator obiiter.IBioSequence) *MatrixData { } func CLIWriteCSVToStdout(matrix *MatrixData) { - navalue := CLINaValue() + navalue := CLIMapNaValue() csvwriter := csv.NewWriter(os.Stdout) if CLITranspose() { @@ -147,33 +259,45 @@ func CLIWriteCSVToStdout(matrix *MatrixData) { samples := obiutils.NewSet[string]() - for _, v := range *matrix { + for _, v := range matrix.matrix { samples.Add(maps.Keys(v)...) } osamples := samples.Members() sort.Strings(osamples) - - columns := make([]string, 1, len(osamples)+1) - columns[0] = "id" + columns := make([]string, 0, len(osamples)+len(matrix.attributeList)) + columns = append(columns, matrix.attributeList...) columns = append(columns, osamples...) + header := slices.Clone(columns) + csvwriter.Write(columns) + nattribs := len(matrix.attributeList) - for k, data := range *matrix { - columns = columns[0:1] - columns[0] = k - for _, kk := range osamples { - if v, ok := data[kk]; ok { - vs, err := obiutils.InterfaceToString(v) - if err != nil { - log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk) + for k, data := range matrix.matrix { + attrs := matrix.attributes[k] + for i, kk := range header { + if i < nattribs { + if v, ok := attrs[kk]; ok { + vs, err := obiutils.InterfaceToString(v) + if err != nil { + log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk) + } + columns[i] = vs + } else { + columns[i] = matrix.naValue } - columns = append(columns, vs) } else { - columns = append(columns, navalue) + if v, ok := data[kk]; ok { + vs, err := obiutils.InterfaceToString(v) + if err != nil { + log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, k, kk) + } + columns[i] = vs + } else { + columns[i] = navalue + } } - } csvwriter.Write(columns) } @@ -187,8 +311,8 @@ func CLIWriteThreeColumnsToStdout(matrix *MatrixData) { csvwriter := csv.NewWriter(os.Stdout) csvwriter.Write([]string{"id", sname, vname}) - for seqid := range *matrix { - for attr, v := range (*matrix)[seqid] { + for seqid := range matrix.matrix { + for attr, v := range matrix.matrix[seqid] { vs, err := obiutils.InterfaceToString(v) if err != nil { log.Panicf("value %v in sequence %s for attribute %s cannot be casted to a string", v, seqid, attr) diff --git a/pkg/obitools/obimatrix/options.go b/pkg/obitools/obimatrix/options.go index 5c2216e..881cb02 100644 --- a/pkg/obitools/obimatrix/options.go +++ b/pkg/obitools/obimatrix/options.go @@ -6,6 +6,7 @@ package obimatrix import ( "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv" "github.com/DavidGamba/go-getoptions" ) @@ -14,7 +15,8 @@ var __transpose__ = true var __mapAttribute__ = "merged_sample" var __valueName__ = "count" var __sampleName__ = "sample" -var __NAValue__ = "0" +var __MapNAValue__ = "0" +var __AllowEmpty__ = false func MatrixOptionSet(options *getoptions.GetOpt) { options.BoolVar(&__threeColumns__, "three-columns", false, @@ -32,12 +34,16 @@ func MatrixOptionSet(options *getoptions.GetOpt) { options.StringVar(&__sampleName__, "sample-name", __sampleName__, options.Description("Name of the coulumn containing the sample names in the three column format.")) - options.StringVar(&__NAValue__, "na-value", __NAValue__, + options.StringVar(&__MapNAValue__, "map-na-value", __MapNAValue__, options.Description("Value used when the map attribute is not defined for a sequence.")) + + options.BoolVar(&__AllowEmpty__, "allow-empty", __AllowEmpty__, + options.Description("Allow sequences with empty map")) } func OptionSet(options *getoptions.GetOpt) { MatrixOptionSet(options) + obicsv.CSVOptionSet(options) obiconvert.InputOptionSet(options) } @@ -57,8 +63,8 @@ func CLISampleName() string { return __sampleName__ } -func CLINaValue() string { - return __NAValue__ +func CLIMapNaValue() string { + return __MapNAValue__ } func CLIMapAttribute() string { @@ -68,3 +74,7 @@ func CLIMapAttribute() string { func CLITranspose() bool { return __transpose__ } + +func CLIStrict() bool { + return !__AllowEmpty__ +}