From 1b43fa42474828d42e63d35fd96824183a37ec54 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Thu, 6 Jun 2024 23:11:13 +0200 Subject: [PATCH] First complete version of obimultiplex2 Former-commit-id: 170593bd597c7914d3f1fd3d2b865186d7f11966 --- pkg/obiformats/ngsfilter_read.go | 115 +++++- pkg/obiformats/universal_read.go | 2 +- pkg/obingslibrary/marker.go | 125 ++++++- pkg/obingslibrary/match.go | 17 +- pkg/obingslibrary/multimatch.go | 490 +++++++++++++++++++++----- pkg/obingslibrary/ngslibrary.go | 182 ++++++++-- pkg/obingslibrary/worker.go | 8 +- pkg/obioptions/version.go | 2 +- pkg/obitools/obimultiplex2/options.go | 2 +- 9 files changed, 778 insertions(+), 165 deletions(-) diff --git a/pkg/obiformats/ngsfilter_read.go b/pkg/obiformats/ngsfilter_read.go index ea1264d..c305df0 100644 --- a/pkg/obiformats/ngsfilter_read.go +++ b/pkg/obiformats/ngsfilter_read.go @@ -67,24 +67,22 @@ func _parseMainNGSFilterTags(text string) obingslibrary.TagPair { } } -func _parseMainNGSFilter(text string) (obingslibrary.PrimerPair, obingslibrary.TagPair, string, string, bool, bool) { +func _parseMainNGSFilter(text string) (obingslibrary.PrimerPair, obingslibrary.TagPair, string, string, bool) { fields := strings.Fields(text) if len(fields) != 6 { - return obingslibrary.PrimerPair{}, obingslibrary.TagPair{}, "", "", false, false + return obingslibrary.PrimerPair{}, obingslibrary.TagPair{}, "", "", false } tags := _parseMainNGSFilterTags(fields[2]) - partial := fields[5] == "T" || fields[5] == "t" return obingslibrary.PrimerPair{ - Forward: fields[3], - Reverse: fields[4], + Forward: strings.ToLower(fields[3]), + Reverse: strings.ToLower(fields[4]), }, tags, fields[0], fields[1], - partial, true } @@ -164,7 +162,12 @@ func OBIMimeNGSFilterTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, e } func ReadNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { - mimetype, newReader, err := OBIMimeNGSFilterTypeGuesser(reader) + var ngsfilter *obingslibrary.NGSLibrary + var err error + var mimetype *mimetype.MIME + var newReader io.Reader + + mimetype, newReader, err = OBIMimeNGSFilterTypeGuesser(reader) if err != nil { return nil, err @@ -172,12 +175,22 @@ func ReadNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { log.Infof("NGSFilter configuration mimetype: %s", mimetype.String()) - if mimetype.String() == "text/ngsfilter-csv" { - return ReadCSVNGSFilter(newReader) + if mimetype.String() == "text/ngsfilter-csv" || mimetype.String() == "text/csv" { + ngsfilter, err = ReadCSVNGSFilter(newReader) + } else { + ngsfilter, err = ReadOldNGSFilter(newReader) } - return ReadOldNGSFilter(newReader) + if err != nil { + return nil, err + } + + ngsfilter.CheckPrimerUnicity() + ngsfilter.CheckTagLength() + + return ngsfilter, nil } + func ReadOldNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { ngsfilter := obingslibrary.MakeNGSLibrary() @@ -196,7 +209,7 @@ func ReadOldNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { return nil, fmt.Errorf("line %d : invalid format", i+1) } - primers, tags, experiment, sample, partial, ok := _parseMainNGSFilter(split[0]) + primers, tags, experiment, sample, ok := _parseMainNGSFilter(split[0]) if !ok { return nil, fmt.Errorf("line %d : invalid format : \n%s", i+1, line) @@ -213,7 +226,6 @@ func ReadOldNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { pcr.Experiment = experiment pcr.Sample = sample - pcr.Partial = partial if len(split) > 1 && len(split[1]) > 0 { pcr.Annotations = make(obiseq.Annotation) @@ -288,9 +300,11 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value case 0: log.Fatalln("Missing value for @tag_delimiter parameter") case 1: - library.SetTagDelimiter([]byte(values[0])[0]) + value := []byte(values[0])[0] + library.SetTagDelimiter(value) case 2: - library.SetTagDelimiterFor(values[0], []byte(values[1])[0]) + value := []byte(values[1])[0] + library.SetTagDelimiterFor(values[0], value) default: log.Fatalln("Invalid value for @tag_delimiter parameter") } @@ -300,7 +314,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value case 0: log.Fatalln("Missing value for @forward_tag_delimiter parameter") case 1: - library.SetForwardTagDelimiter([]byte(values[0])[0]) + value := []byte(values[0])[0] + library.SetForwardTagDelimiter(value) default: log.Fatalln("Invalid value for @forward_tag_delimiter parameter") } @@ -310,7 +325,8 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value case 0: log.Fatalln("Missing value for @reverse_tag_delimiter parameter") case 1: - library.SetReverseTagDelimiter([]byte(values[0])[0]) + value := []byte(values[0])[0] + library.SetReverseTagDelimiter(value) default: log.Fatalln("Invalid value for @reverse_tag_delimiter parameter") } @@ -338,21 +354,85 @@ var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, value log.Fatalf("Invalid value %s for @primer_error parameter", values[0]) } - library.SetAllowedMismatch(dist) + library.SetAllowedMismatches(dist) + case 2: + primer := values[0] + dist, err := strconv.Atoi(values[1]) + + if err != nil { + log.Fatalf("Invalid value %s for @primer_error parameter", values[1]) + } + + library.SetAllowedMismatchesFor(primer, dist) default: log.Fatalln("Invalid value for @primer_error parameter") } }, + "forward_mismatches": func(library *obingslibrary.NGSLibrary, values ...string) { + switch len(values) { + case 0: + log.Fatalln("Missing value for @forward_primer_error parameter") + case 1: + dist, err := strconv.Atoi(values[0]) + + if err != nil { + log.Fatalf("Invalid value %s for @forward_primer_error parameter", values[0]) + } + + library.SetForwardAllowedMismatches(dist) + default: + log.Fatalln("Invalid value for @forward_primer_error parameter") + } + }, + "reverse_mismatches": func(library *obingslibrary.NGSLibrary, values ...string) { + switch len(values) { + case 0: + log.Fatalln("Missing value for @reverse_primer_error parameter") + case 1: + dist, err := strconv.Atoi(values[0]) + + if err != nil { + log.Fatalf("Invalid value %s for @reverse_primer_error parameter", values[0]) + } + + library.SetReverseAllowedMismatches(dist) + default: + log.Fatalln("Invalid value for @reverse_primer_error parameter") + } + }, "indels": func(library *obingslibrary.NGSLibrary, values ...string) { switch len(values) { case 0: log.Fatalln("Missing value for @indels parameter") case 1: library.SetAllowsIndels(values[0] == "true") + case 2: + library.SetAllowsIndelsFor(values[0], values[1] == "true") default: log.Fatalln("Invalid value for @indels parameter") } }, + + "forward_indels": func(library *obingslibrary.NGSLibrary, values ...string) { + switch len(values) { + case 0: + log.Fatalln("Missing value for @forward_indels parameter") + case 1: + library.SetForwardAllowsIndels(values[0] == "true") + default: + log.Fatalln("Invalid value for @forward_indels parameter") + } + }, + "reverse_indels": func(library *obingslibrary.NGSLibrary, values ...string) { + switch len(values) { + case 0: + log.Fatalln("Missing value for @reverse_indels parameter") + case 1: + library.SetReverseAllowsIndels(values[0] == "true") + default: + log.Fatalln("Invalid value for @reverse_indels parameter") + } + }, } func ReadCSVNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { @@ -451,7 +531,6 @@ func ReadCSVNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { pcr.Experiment = fields[experimentColIndex] pcr.Sample = fields[sampleColIndex] - pcr.Partial = false if extraColumns != nil { pcr.Annotations = make(obiseq.Annotation) diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go index 157bb12..7dc8140 100644 --- a/pkg/obiformats/universal_read.go +++ b/pkg/obiformats/universal_read.go @@ -43,7 +43,7 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) { } fastqDetector := func(raw []byte, limit uint32) bool { - ok, err := regexp.Match("^@[^ ]", raw) + ok, err := regexp.Match("^@[^ ].*\n[^ ]+\n\\+", raw) return ok && err == nil } diff --git a/pkg/obingslibrary/marker.go b/pkg/obingslibrary/marker.go index 5b04461..0518828 100644 --- a/pkg/obingslibrary/marker.go +++ b/pkg/obingslibrary/marker.go @@ -2,6 +2,7 @@ package obingslibrary import ( "fmt" + "log" "strings" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" @@ -18,6 +19,12 @@ type Marker struct { Reverse_tag_length int Forward_spacer int Reverse_spacer int + Forward_error int + Reverse_error int + Forward_allows_indels bool + Reverse_allows_indels bool + Forward_matching string + Reverse_matching string Forward_tag_delimiter byte Reverse_tag_delimiter byte samples map[TagPair]*PCR @@ -48,6 +55,38 @@ func (marker *Marker) Compile(forward, reverse string, maxError int, allowsIndel return nil } +func (marker *Marker) Compile2(forward, reverse string) error { + var err error + + marker.CheckTagLength() + + marker.forward, err = obiapat.MakeApatPattern( + forward, + marker.Forward_error, + marker.Forward_allows_indels) + if err != nil { + return err + } + marker.reverse, err = obiapat.MakeApatPattern( + reverse, + marker.Reverse_error, + marker.Reverse_allows_indels) + if err != nil { + return err + } + + marker.cforward, err = marker.forward.ReverseComplement() + if err != nil { + return err + } + marker.creverse, err = marker.reverse.ReverseComplement() + if err != nil { + return err + } + + return nil +} + // Match finds the best matching demultiplex for a given sequence. // // Parameters: @@ -200,7 +239,8 @@ func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { } func (marker *Marker) GetPCR(forward, reverse string) (*PCR, bool) { - pair := TagPair{forward, reverse} + pair := TagPair{strings.ToLower(forward), + strings.ToLower(reverse)} pcr, ok := marker.samples[pair] if ok { @@ -234,7 +274,7 @@ func (marker *Marker) CheckTagLength() error { others = append(others, k) } } - return fmt.Errorf("forward tag length %d is not the same for all the PCRs : %v\n", maxfl, others) + return fmt.Errorf("forward tag length %d is not the same for all the PCRs : %v", maxfl, others) } maxrl, _ := obiutils.MaxMap(reverse_length) @@ -246,7 +286,7 @@ func (marker *Marker) CheckTagLength() error { others = append(others, k) } } - return fmt.Errorf("reverse tag length %d is not the same for all the PCRs : %v\n", maxrl, others) + return fmt.Errorf("reverse tag length %d is not the same for all the PCRs : %v", maxrl, others) } marker.Forward_tag_length = maxfl @@ -267,15 +307,90 @@ func (marker *Marker) SetTagSpacer(spacer int) { marker.SetReverseTagSpacer(spacer) } +func normalizeTagDelimiter(delim byte) byte { + if delim == '0' || delim == 0 { + delim = 0 + } else { + if delim >= 'A' && delim <= 'Z' { + delim = delim + 'A' - 'a' + } + if delim != 'a' && delim != 'c' && delim != 'g' && delim != 't' { + log.Fatalf("invalid reverse tag delimiter: %c, only 'a', 'c', 'g', 't' and '0' are allowed", delim) + } + } + + return delim +} + func (marker *Marker) SetForwardTagDelimiter(delim byte) { - marker.Forward_tag_delimiter = delim + marker.Forward_tag_delimiter = normalizeTagDelimiter(delim) } func (marker *Marker) SetReverseTagDelimiter(delim byte) { - marker.Reverse_tag_delimiter = delim + marker.Reverse_tag_delimiter = normalizeTagDelimiter(delim) } func (marker *Marker) SetTagDelimiter(delim byte) { marker.SetForwardTagDelimiter(delim) marker.SetReverseTagDelimiter(delim) } + +func (marker *Marker) SetForwardAllowedMismatches(allowed_mismatches int) { + marker.Forward_error = allowed_mismatches +} + +func (marker *Marker) SetReverseAllowedMismatches(allowed_mismatches int) { + marker.Reverse_error = allowed_mismatches +} + +func (marker *Marker) SetAllowedMismatch(allowed_mismatches int) { + marker.SetForwardAllowedMismatches(allowed_mismatches) + marker.SetReverseAllowedMismatches(allowed_mismatches) +} + +func (marker *Marker) SetForwardAllowsIndels(allows_indel bool) { + marker.Forward_allows_indels = allows_indel +} + +func (marker *Marker) SetReverseAllowsIndels(allows_indel bool) { + marker.Reverse_allows_indels = allows_indel +} + +func (marker *Marker) SetAllowsIndel(allows_indel bool) { + marker.SetForwardAllowsIndels(allows_indel) + marker.SetReverseAllowsIndels(allows_indel) +} + +func (marker *Marker) SetForwardMatching(matching string) error { + switch matching { + case "strict", "hamming", "indel": // Valid matching strategies + marker.Forward_matching = matching + + default: + return fmt.Errorf("invalid matching : %s", matching) + } + + return nil +} + +func (marker *Marker) SetReverseMatching(matching string) error { + switch matching { + case "strict", "hamming", "indel": // Valid matching strategies + marker.Reverse_matching = matching + + default: + return fmt.Errorf("invalid matching : %s", matching) + } + + return nil +} + +func (marker *Marker) SetMatching(matching string) error { + if err := marker.SetForwardMatching(matching); err != nil { + return err + } + if err := marker.SetReverseMatching(matching); err != nil { + return err + } + return nil +} diff --git a/pkg/obingslibrary/match.go b/pkg/obingslibrary/match.go index 75f2353..ab0c405 100644 --- a/pkg/obingslibrary/match.go +++ b/pkg/obingslibrary/match.go @@ -3,7 +3,6 @@ package obingslibrary import ( "errors" "fmt" - "strings" log "github.com/sirupsen/logrus" @@ -26,24 +25,12 @@ type DemultiplexMatch struct { Error error } -func (library *NGSLibrary) Compile(maxError int, allowsIndel bool) error { - for primers, marker := range library.Markers { - err := marker.Compile(primers.Forward, - primers.Reverse, - maxError, allowsIndel) - if err != nil { - return err - } - } - return nil -} - func (library *NGSLibrary) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { for primers, marker := range library.Markers { m := marker.Match(sequence) if m != nil { - m.ForwardPrimer = strings.ToLower(primers.Forward) - m.ReversePrimer = strings.ToLower(primers.Reverse) + m.ForwardPrimer = primers.Forward + m.ReversePrimer = primers.Reverse return m } } diff --git a/pkg/obingslibrary/multimatch.go b/pkg/obingslibrary/multimatch.go index 8fa1fbd..f7cf1c0 100644 --- a/pkg/obingslibrary/multimatch.go +++ b/pkg/obingslibrary/multimatch.go @@ -2,6 +2,7 @@ package obingslibrary import ( "fmt" + "math" "sort" log "github.com/sirupsen/logrus" @@ -23,7 +24,232 @@ type TagMatcher func( sequence *obiseq.BioSequence, begin, end int, forward bool) (TagPair, error) -func (library *NGSLibrary) TagExtractorFixedLength( +func Hamming(a, b string) int { + + if len(a) != len(b) { + return max(len(a), len(b)) + } + + count := int(0) + + for i := 0; i < len(a); i++ { + if a[i] != b[i] { + count++ + } + } + + return count +} + +func Levenshtein(s1, s2 string) int { + lenS1, lenS2 := len(s1), len(s2) + if lenS1 == 0 { + return lenS2 + } + if lenS2 == 0 { + return lenS1 + } + + // Create two slices to store the distances + prev := make([]int, lenS2+1) + curr := make([]int, lenS2+1) + + // Initialize the previous row of the matrix + for j := 0; j <= lenS2; j++ { + prev[j] = j + } + + // Iterate over each character in s1 + for i := 1; i <= lenS1; i++ { + curr[0] = i + + // Iterate over each character in s2 + for j := 1; j <= lenS2; j++ { + cost := 0 + if s1[i-1] != s2[j-1] { + cost = 1 + } + + // Calculate the minimum cost for the current cell + curr[j] = min(prev[j]+1, + curr[j-1]+1, // Insertion + prev[j-1]+cost) // Substitution + } + // Copy current row to previous row for the next iteration + prev, curr = curr, prev + } + + // The last value in the previous row is the Levenshtein distance + return prev[lenS2] +} + +func lookForTag(seq string, delimiter byte) string { + + i := len(seq) - 1 + + for i >= 0 && seq[i] != delimiter { + i-- + } + + for i >= 0 && seq[i] == delimiter { + i-- + } + + end := i + 1 + + for i >= 0 && seq[i] != delimiter { + i-- + } + + begin := i + 1 + + if i < 0 { + return "" + } + + return seq[begin:end] +} + +func (marker *Marker) beginDelimitedTagExtractor( + sequence *obiseq.BioSequence, + begin int, + forward bool) string { + + taglength := marker.Forward_spacer + marker.Forward_tag_length + delimiter := marker.Forward_tag_delimiter + + if !forward { + taglength = marker.Reverse_spacer + marker.Reverse_tag_length + delimiter = marker.Reverse_tag_delimiter + } + + fb := begin - taglength*2 + if fb < 0 { + fb = 0 + } + + return lookForTag(sequence.String()[fb:begin], delimiter) +} + +func (marker *Marker) beginFixedTagExtractor( + sequence *obiseq.BioSequence, + begin int, + forward bool) string { + + taglength := marker.Forward_tag_length + spacer := marker.Forward_spacer + + if !forward { + taglength = marker.Reverse_tag_length + spacer = marker.Reverse_spacer + } + + fb := begin - spacer - taglength + if fb < 0 { + log.Warnf("begin %v", fb) + return "" + } + + return sequence.String()[fb : begin-spacer] +} + +func (marker *Marker) endDelimitedTagExtractor( + sequence *obiseq.BioSequence, + end int, + forward bool) string { + + taglength := marker.Reverse_spacer + marker.Reverse_tag_length + delimiter := marker.Reverse_tag_delimiter + + if !forward { + taglength = marker.Forward_spacer + marker.Forward_tag_length + delimiter = marker.Forward_tag_delimiter + } + + fb := end + taglength*2 + if fb > sequence.Len() { + fb = sequence.Len() + } + + if end >= fb { + return "" + } + + tag_seq, err := sequence.Subsequence(end, fb, false) + + if err != nil { + log.Fatalf("Cannot extract sequence tag : %v", err) + } + + return lookForTag(tag_seq.ReverseComplement(true).String(), delimiter) +} + +func (marker *Marker) endFixedTagExtractor( + sequence *obiseq.BioSequence, + end int, + forward bool) string { + + taglength := marker.Reverse_tag_length + spacer := marker.Reverse_spacer + + if !forward { + taglength = marker.Forward_tag_length + spacer = marker.Forward_spacer + } + + fe := end + spacer + taglength + if fe > sequence.Len() { + return "" + } + + tag_seq, err := sequence.Subsequence(end+spacer, fe, false) + + if err != nil { + log.Fatalf("Cannot extract sequence tag : %v", err) + } + + return tag_seq.ReverseComplement(true).String() +} + +func (marker *Marker) beginTagExtractor( + sequence *obiseq.BioSequence, + begin int, + forward bool) string { + if forward { + if marker.Forward_tag_delimiter == 0 { + return marker.beginFixedTagExtractor(sequence, begin, forward) + } else { + return marker.beginDelimitedTagExtractor(sequence, begin, forward) + } + } else { + if marker.Reverse_tag_delimiter == 0 { + return marker.beginFixedTagExtractor(sequence, begin, forward) + } else { + return marker.beginDelimitedTagExtractor(sequence, begin, forward) + } + } +} + +func (marker *Marker) endTagExtractor( + sequence *obiseq.BioSequence, + end int, + forward bool) string { + if forward { + if marker.Reverse_tag_delimiter == 0 { + return marker.endFixedTagExtractor(sequence, end, forward) + } else { + return marker.endDelimitedTagExtractor(sequence, end, forward) + } + } else { + if marker.Forward_tag_delimiter == 0 { + return marker.endFixedTagExtractor(sequence, end, forward) + } else { + return marker.endDelimitedTagExtractor(sequence, end, forward) + } + } +} + +func (library *NGSLibrary) TagExtractor( sequence *obiseq.BioSequence, annotations obiseq.Annotation, primers PrimerPair, @@ -32,81 +258,140 @@ func (library *NGSLibrary) TagExtractorFixedLength( marker, ok := library.Markers[primers] - forward_tag := "" - reverse_tag := "" - if !ok { log.Fatalf("marker not found : %v", primers) } - fb := 0 + forward_tag := marker.beginTagExtractor(sequence, begin, forward) + reverse_tag := marker.endTagExtractor(sequence, end, forward) - if forward { - annotations["direction"] = "direct" - fb = begin - marker.Forward_spacer - marker.Forward_tag_length - if fb < 0 { - annotations["demultiplex_error"] = "Cannot extract forward tag" - return nil - } - forward_tag = sequence.String()[fb:(fb + marker.Forward_tag_length)] - - fb = end + marker.Reverse_spacer - if (fb + marker.Reverse_tag_length) > sequence.Len() { - annotations["demultiplex_error"] = "Cannot extract reverse tag" - return nil - } - - seq_tag, err := sequence.Subsequence(fb, fb+marker.Forward_tag_length, false) - - if err != nil { - annotations["demultiplex_error"] = "Cannot extract forward tag" - return nil - } - - reverse_tag = seq_tag.ReverseComplement(true).String() - - } else { - annotations["direction"] = "reverse" - fb = end + marker.Forward_spacer - if (fb + marker.Forward_tag_length) > sequence.Len() { - annotations["demultiplex_error"] = "Cannot extract forward tag" - return nil - } - - seq_tag, err := sequence.Subsequence(fb, fb+marker.Forward_tag_length, false) - - if err != nil { - annotations["demultiplex_error"] = "Cannot extract forward tag" - return nil - } - - forward_tag = seq_tag.ReverseComplement(true).String() - - fb = begin - marker.Reverse_spacer - marker.Reverse_tag_length - if fb < 0 { - sequence.SetAttribute("demultiplex_error", "Cannot extract reverse tag") - return nil - } - reverse_tag = sequence.String()[fb:(fb + marker.Reverse_tag_length)] + if forward_tag != "" { + annotations["forward_tag"] = forward_tag } - annotations["forward_tag"] = forward_tag - annotations["reverse_tag"] = reverse_tag - - return &TagPair{ - Forward: forward_tag, - Reverse: reverse_tag, + if reverse_tag != "" { + annotations["reverse_tag"] = reverse_tag } + + return &TagPair{forward_tag, reverse_tag} } -func (library *NGSLibrary) StrictSampleIdentifier(primerseqs PrimerPair, tags *TagPair, annotations obiseq.Annotation) *PCR { - marker := library.Markers[primerseqs] - pcr, ok := marker.samples[*tags] +func (marker *Marker) ClosestForwardTag( + tag string, + dist func(string, string) int, +) (string, int) { + mindist := math.MaxInt + mintag := "" + + for ts := range marker.samples { + d := dist(ts.Forward, tag) + + if d == mindist { + mintag = "" + } + + if d < mindist { + mindist = d + mintag = ts.Forward + } + } + + return mintag, mindist +} + +func (marker *Marker) ClosestReverseTag( + tag string, + dist func(string, string) int, +) (string, int) { + mindist := math.MaxInt + mintag := "" + + for ts := range marker.samples { + d := dist(ts.Reverse, tag) + + if d == mindist { + mintag = "" + } + + if d < mindist { + mindist = d + mintag = ts.Reverse + } + } + + return mintag, mindist +} + +func (library *NGSLibrary) SampleIdentifier( + primerseqs PrimerPair, + tags *TagPair, + annotations obiseq.Annotation) *PCR { + + marker, ok := library.Markers[primerseqs] if !ok { + log.Fatalf("marker not found : %v", primerseqs) + } + + forward := "" + reverse := "" + + fdistance := int(0) + rdistance := int(0) + + if tags.Forward != "" { + switch marker.Forward_matching { + case "strict": + forward = tags.Forward + fdistance = 0 + annotations["obimultiplex_forward_matching"] = "strict" + case "hamming": + forward, fdistance = marker.ClosestForwardTag(tags.Forward, Hamming) + annotations["obimultiplex_forward_matching"] = "hamming" + annotations["obimultiplex_forward_tag_dist"] = fdistance + annotations["obimultiplex_forward_proposed_tag"] = forward + case "indel": + forward, fdistance = marker.ClosestForwardTag(tags.Forward, Levenshtein) + annotations["obimultiplex_forward_matching"] = "indel" + annotations["obimultiplex_forward_tag_dist"] = fdistance + annotations["obimultiplex_forward_proposed_tag"] = forward + } + } + + if tags.Reverse != "" { + switch marker.Reverse_matching { + case "strict": + reverse = tags.Reverse + rdistance = 0 + annotations["obimultiplex_reverse_matching"] = "strict" + case "hamming": + reverse, rdistance = marker.ClosestReverseTag(tags.Reverse, Hamming) + annotations["obimultiplex_reverse_matching"] = "hamming" + annotations["obimultiplex_reverse_tag_dist"] = rdistance + annotations["obimultiplex_reverse_proposed_tag"] = reverse + case "indel": + reverse, rdistance = marker.ClosestReverseTag(tags.Reverse, Levenshtein) + annotations["obimultiplex_reverse_matching"] = "indel" + annotations["obimultiplex_reverse_tag_dist"] = rdistance + annotations["obimultiplex_reverse_proposed_tag"] = reverse + } + } + + proposed := TagPair{forward, reverse} + + pcr, ok := marker.samples[proposed] + + if !ok { + annotations["demultiplex_error"] = fmt.Sprintf("Cannot associate sample to the tag pair (%s:%s)", forward, reverse) return nil } + annotations["sample"] = pcr.Sample + annotations["experiment"] = pcr.Experiment + for k, v := range pcr.Annotations { + annotations[k] = v + } + return pcr } @@ -202,75 +487,84 @@ func (library *NGSLibrary) ExtractMultiBarcode(sequence *obiseq.BioSequence) (ob case 1: if match.Marker == -from.Marker && match.Forward == from.Forward { - + barcode_error := false annotations := obiseq.GetAnnotation() annotations["forward_primer"] = primerseqs[from.Marker].Forward annotations["reverse_primer"] = primerseqs[from.Marker].Reverse if from.Forward { + if from.Begin < 0 || from.End > sequence.Len() { + barcode_error = true + annotations["multiplex_error"] = "Cannot extract forward match" + } else { + annotations["forward_match"] = sequence.String()[from.Begin:from.End] + } - annotations["forward_match"] = sequence.String()[from.Begin:from.End] sseq, err := sequence.Subsequence(match.Begin, match.End, false) if err != nil { + barcode_error = true annotations["multiplex_error"] = "Cannot extract reverse match" + } else { + annotations["reverse_match"] = sseq.ReverseComplement(true).String() } - annotations["reverse_match"] = sseq.ReverseComplement(true).String() annotations["forward_error"] = from.Mismatches annotations["reverse_error"] = match.Mismatches } else { - annotations["reverse_match"] = sequence.String()[from.Begin:from.End] + if from.Begin < 0 || from.End > sequence.Len() { + barcode_error = true + annotations["multiplex_error"] = "Cannot extract reverse match" + } else { + annotations["reverse_match"] = sequence.String()[from.Begin:from.End] + } + sseq, err := sequence.Subsequence(match.Begin, match.End, false) if err != nil { + barcode_error = true annotations["multiplex_error"] = "Cannot extract forward match" + } else { + annotations["forward_match"] = sseq.ReverseComplement(true).String() } - annotations["forward_match"] = sseq.ReverseComplement(true).String() - annotations["reverse_error"] = from.Mismatches annotations["forward_error"] = match.Mismatches } - tags := library.TagExtractorFixedLength(sequence, annotations, primerseqs[from.Marker], from.Begin, match.End, from.Forward) + if !barcode_error { + tags := library.TagExtractor(sequence, annotations, primerseqs[from.Marker], from.Begin, match.End, from.Forward) - barcode, err := sequence.Subsequence(from.End, match.Begin, false) + barcode, err := sequence.Subsequence(from.End, match.Begin, false) - if err != nil { - return nil, fmt.Errorf("%s [%s] : Cannot extract barcode %d : %v", sequence.Id(), sequence.Source(), q, err) - } + if err != nil { + return nil, fmt.Errorf("%s [%s] : Cannot extract barcode %d : %v", sequence.Id(), sequence.Source(), q, err) + } - if !match.Forward { - barcode = barcode.ReverseComplement(true) - } + if !match.Forward { + barcode = barcode.ReverseComplement(true) + } - if tags != nil { - pcr := library.StrictSampleIdentifier(primerseqs[from.Marker], tags, annotations) + if tags != nil { + library.SampleIdentifier(primerseqs[from.Marker], tags, annotations) + } - if pcr == nil { - annotations["demultiplex_error"] = "Cannot associate sample to the tag pair" - } else { - annotations["sample"] = pcr.Sample - annotations["experiment"] = pcr.Experiment - for k, v := range pcr.Annotations { - annotations[k] = v - } + barcode.AnnotationsLock() + obiutils.MustFillMap(barcode.Annotations(), annotations) + barcode.AnnotationsUnlock() + + if barcode.Len() > 0 { + results = append(results, barcode) + q++ } } - barcode.AnnotationsLock() - obiutils.MustFillMap(barcode.Annotations(), annotations) - barcode.AnnotationsUnlock() - - results = append(results, barcode) state = 0 - q++ } else if match.Marker > 0 { - log.Warnf("Marker mismatch : %d %d", match.Marker, from.Marker) + log.Debugf("Marker mismatch : %d %d", match.Marker, from.Marker) from = match } else { - log.Warnf("Marker mismatch : %d %d", match.Marker, from.Marker) + log.Debugf("Marker mismatch : %d %d", match.Marker, from.Marker) state = 0 } } @@ -292,7 +586,15 @@ func (library *NGSLibrary) ExtractMultiBarcode(sequence *obiseq.BioSequence) (ob func (library *NGSLibrary) ExtractMultiBarcodeSliceWorker(options ...WithOption) obiseq.SeqSliceWorker { opt := MakeOptions(options) - library.Compile(opt.AllowedMismatch(), opt.AllowsIndel()) + if opt.AllowsIndels() { + library.SetAllowsIndels(true) + } + + if opt.AllowedMismatches() > 0 { + library.SetAllowedMismatches(opt.AllowedMismatches()) + } + + library.Compile2() worker := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { return library.ExtractMultiBarcode(sequence) diff --git a/pkg/obingslibrary/ngslibrary.go b/pkg/obingslibrary/ngslibrary.go index a84d25a..9f30de0 100644 --- a/pkg/obingslibrary/ngslibrary.go +++ b/pkg/obingslibrary/ngslibrary.go @@ -2,6 +2,7 @@ package obingslibrary import ( "fmt" + "strings" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" ) @@ -19,30 +20,24 @@ type TagPair struct { type PCR struct { Experiment string Sample string - Partial bool Annotations obiseq.Annotation } type NGSLibrary struct { - Matching string - Allowed_mismatches int - Allows_indels bool - Primers map[string]PrimerPair - Markers map[PrimerPair]*Marker + Primers map[string]PrimerPair + Markers map[PrimerPair]*Marker } func MakeNGSLibrary() NGSLibrary { return NGSLibrary{ - Matching: "strict", - Allowed_mismatches: 2, - Allows_indels: false, - Primers: make(map[string]PrimerPair, 10), - Markers: make(map[PrimerPair]*Marker, 10), + Primers: make(map[string]PrimerPair, 10), + Markers: make(map[PrimerPair]*Marker, 10), } } func (library *NGSLibrary) GetMarker(forward, reverse string) (*Marker, bool) { - pair := PrimerPair{forward, reverse} + pair := PrimerPair{strings.ToLower(forward), + strings.ToLower(reverse)} marker, ok := (library.Markers)[pair] if ok { @@ -56,6 +51,12 @@ func (library *NGSLibrary) GetMarker(forward, reverse string) (*Marker, bool) { Reverse_spacer: 0, Forward_tag_delimiter: 0, Reverse_tag_delimiter: 0, + Forward_error: 2, + Reverse_error: 2, + Forward_matching: "strict", + Reverse_matching: "strict", + Forward_allows_indels: false, + Reverse_allows_indels: false, samples: make(map[TagPair]*PCR, 1000), } @@ -64,6 +65,29 @@ func (library *NGSLibrary) GetMarker(forward, reverse string) (*Marker, bool) { return &m, false } +func (library *NGSLibrary) Compile(maxError int, allowsIndel bool) error { + for primers, marker := range library.Markers { + err := marker.Compile(primers.Forward, + primers.Reverse, + maxError, allowsIndel) + if err != nil { + return err + } + } + return nil +} + +func (library *NGSLibrary) Compile2() error { + for primers, marker := range library.Markers { + err := marker.Compile2(primers.Forward, + primers.Reverse) + if err != nil { + return err + } + } + return nil +} + func (library *NGSLibrary) SetForwardTagSpacer(spacer int) { for _, marker := range library.Markers { marker.SetForwardTagSpacer(spacer) @@ -82,6 +106,7 @@ func (library *NGSLibrary) SetTagSpacer(spacer int) { } func (library *NGSLibrary) SetTagSpacerFor(primer string, spacer int) { + primer = strings.ToLower(primer) primers, ok := library.Primers[primer] if ok { @@ -115,6 +140,7 @@ func (library *NGSLibrary) SetTagDelimiter(delim byte) { } func (library *NGSLibrary) SetTagDelimiterFor(primer string, delim byte) { + primer = strings.ToLower(primer) primers, ok := library.Primers[primer] if ok { @@ -154,27 +180,131 @@ func (library *NGSLibrary) CheckPrimerUnicity() error { return nil } -// SetMatching sets the matching strategy for the library. -// Returns an error if the matching strategy is invalid. -func (library *NGSLibrary) SetMatching(matching string) error { - switch matching { - case "strict", "hamming", "indel": // Valid matching strategies - library.Matching = matching - default: - return fmt.Errorf("invalid matching : %s", matching) +func (library *NGSLibrary) SetForwardMatching(matching string) error { + + for _, marker := range library.Markers { + err := marker.SetForwardMatching(matching) + if err != nil { + return err + } + } + + return nil +} + +func (library *NGSLibrary) SetReverseMatching(matching string) error { + for _, marker := range library.Markers { + err := marker.SetReverseMatching(matching) + if err != nil { + return err + } } return nil } -func (library *NGSLibrary) SetAllowedMismatch(allowed_mismatches int) { - if allowed_mismatches < 0 { - allowed_mismatches = 0 +func (library *NGSLibrary) SetMatching(matching string) error { + err := library.SetForwardMatching(matching) + + if err != nil { + return err } - library.Allowed_mismatches = allowed_mismatches + + err = library.SetReverseMatching(matching) + + return err +} + +func (library *NGSLibrary) SetMatchingFor(primer string, matching string) error { + primer = strings.ToLower(primer) + primers, ok := library.Primers[primer] + + if ok { + marker, ok := library.Markers[primers] + + if ok { + if primer == primers.Forward { + err := marker.SetForwardMatching(matching) + if err != nil { + return err + } + } else { + err := marker.SetReverseMatching(matching) + if err != nil { + return err + } + } + } + } + + return nil } // SetAllowsIndels sets whether the library allows indels. // The value of the argument allows_indels is directly assigned to the library's Allows_indels field. -func (library *NGSLibrary) SetAllowsIndels(allows_indels bool) { - library.Allows_indels = allows_indels +func (library *NGSLibrary) SetForwardAllowsIndels(allows_indels bool) { + for _, marker := range library.Markers { + marker.SetForwardAllowsIndels(allows_indels) + } +} + +func (library *NGSLibrary) SetReverseAllowsIndels(allows_indels bool) { + for _, marker := range library.Markers { + marker.SetReverseAllowsIndels(allows_indels) + } +} + +func (library *NGSLibrary) SetAllowsIndels(allows_indels bool) { + library.SetForwardAllowsIndels(allows_indels) + library.SetReverseAllowsIndels(allows_indels) +} + +func (library *NGSLibrary) SetAllowsIndelsFor(primer string, allows_indel bool) { + primer = strings.ToLower(primer) + primers, ok := library.Primers[primer] + + if ok { + marker, ok := library.Markers[primers] + + if ok { + if primer == primers.Forward { + marker.SetForwardAllowsIndels(allows_indel) + } else { + marker.SetReverseAllowsIndels(allows_indel) + } + } + } +} + +func (library *NGSLibrary) SetForwardAllowedMismatches(allowed_mismatches int) { + for _, marker := range library.Markers { + marker.SetForwardAllowedMismatches(allowed_mismatches) + } +} + +func (library *NGSLibrary) SetReverseAllowedMismatches(allowed_mismatches int) { + for _, marker := range library.Markers { + marker.SetReverseAllowedMismatches(allowed_mismatches) + } +} + +func (library *NGSLibrary) SetAllowedMismatches(allowed_mismatches int) { + library.SetForwardAllowedMismatches(allowed_mismatches) + library.SetReverseAllowedMismatches(allowed_mismatches) +} + +func (library *NGSLibrary) SetAllowedMismatchesFor(primer string, allowed_mismatches int) { + primer = strings.ToLower(primer) + primers, ok := library.Primers[primer] + + if ok { + marker, ok := library.Markers[primers] + + if ok { + if primer == primers.Forward { + marker.SetForwardAllowedMismatches(allowed_mismatches) + } else { + marker.SetReverseAllowedMismatches(allowed_mismatches) + } + } + } } diff --git a/pkg/obingslibrary/worker.go b/pkg/obingslibrary/worker.go index 91e0e01..39a9993 100644 --- a/pkg/obingslibrary/worker.go +++ b/pkg/obingslibrary/worker.go @@ -93,11 +93,11 @@ func (options Options) Unidentified() string { return options.pointer.unidentified } -func (options Options) AllowedMismatch() int { +func (options Options) AllowedMismatches() int { return options.pointer.allowedMismatch } -func (options Options) AllowsIndel() bool { +func (options Options) AllowsIndels() bool { return options.pointer.allowsIndel } @@ -160,7 +160,7 @@ func ExtractBarcodeSlice(ngslibrary *NGSLibrary, opt := MakeOptions(options) - ngslibrary.Compile(opt.AllowedMismatch(), opt.AllowsIndel()) + ngslibrary.Compile(opt.AllowedMismatches(), opt.AllowsIndels()) return _ExtractBarcodeSlice(ngslibrary, sequences, opt) } @@ -170,7 +170,7 @@ func ExtractBarcodeSliceWorker(ngslibrary *NGSLibrary, opt := MakeOptions(options) - ngslibrary.Compile(opt.AllowedMismatch(), opt.AllowsIndel()) + ngslibrary.Compile(opt.AllowedMismatches(), opt.AllowsIndels()) worker := func(sequences obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) { return _ExtractBarcodeSlice(ngslibrary, sequences, opt), nil diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 0323984..869f631 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "0e9a136" +var _Commit = "d600713" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obitools/obimultiplex2/options.go b/pkg/obitools/obimultiplex2/options.go index afa56d9..6fa2054 100644 --- a/pkg/obitools/obimultiplex2/options.go +++ b/pkg/obitools/obimultiplex2/options.go @@ -15,7 +15,7 @@ import ( var _NGSFilterFile = "" var _askTemplate = false var _UnidentifiedFile = "" -var _AllowedMismatch = int(2) +var _AllowedMismatch = -1 var _AllowsIndel = false var _ConservedError = false