From 65f5109957970748239f9f5e0034da961b54bebf Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 4 Jun 2024 16:49:12 +0200 Subject: [PATCH] Plenty of small bugs Former-commit-id: 42c7fab7d65906c80ab4cd32da6867ff21842ea8 --- pkg/obiformats/fastseq_write_fastq.go | 8 +- pkg/obiformats/ngsfilter_read.go | 124 +++++++++- pkg/obiiter/batchiterator.go | 19 +- pkg/obingslibrary/marker.go | 281 ++++++++++++++++++++++ pkg/obingslibrary/match.go | 202 +--------------- pkg/obingslibrary/multimatch.go | 186 ++++++++++++++ pkg/obingslibrary/ngslibrary.go | 113 +++++++-- pkg/obingslibrary/worker.go | 6 +- pkg/obioptions/version.go | 2 +- pkg/obiseq/biosequence.go | 15 +- pkg/obitools/obimultiplex/options.go | 2 +- pkg/obitools/obimultiplex2/demultiplex.go | 60 +++++ pkg/obitools/obimultiplex2/options.go | 108 +++++++++ pkg/obitools/obitagpcr/pcrtag.go | 14 +- pkg/obiutils/minmax.go | 18 ++ 15 files changed, 894 insertions(+), 264 deletions(-) create mode 100644 pkg/obingslibrary/marker.go create mode 100644 pkg/obingslibrary/multimatch.go create mode 100644 pkg/obitools/obimultiplex2/demultiplex.go create mode 100644 pkg/obitools/obimultiplex2/options.go diff --git a/pkg/obiformats/fastseq_write_fastq.go b/pkg/obiformats/fastseq_write_fastq.go index 0dd570e..7d1bea7 100644 --- a/pkg/obiformats/fastseq_write_fastq.go +++ b/pkg/obiformats/fastseq_write_fastq.go @@ -26,11 +26,13 @@ func FormatFastq(seq *obiseq.BioSequence, formater FormatHeader) string { info = formater(seq) } - return fmt.Sprintf("@%s %s\n%s\n+\n%s", + f := fmt.Sprintf("@%s %s\n%s\n+\n%s", seq.Id(), info, seq.String(), q, ) + + return f } func FormatFastqBatch(batch obiiter.BioSequenceBatch, @@ -38,7 +40,8 @@ func FormatFastqBatch(batch obiiter.BioSequenceBatch, var bs bytes.Buffer for _, seq := range batch.Slice() { if seq.Len() > 0 { - bs.WriteString(FormatFastq(seq, formater)) + fs := FormatFastq(seq, formater) + bs.WriteString(fs) bs.WriteString("\n") } else { if skipEmpty { @@ -49,6 +52,7 @@ func FormatFastqBatch(batch obiiter.BioSequenceBatch, } } + return bs.Bytes() } diff --git a/pkg/obiformats/ngsfilter_read.go b/pkg/obiformats/ngsfilter_read.go index dd1627b..5f53f40 100644 --- a/pkg/obiformats/ngsfilter_read.go +++ b/pkg/obiformats/ngsfilter_read.go @@ -6,8 +6,11 @@ import ( "encoding/csv" "fmt" "io" + "strconv" "strings" + "errors" + log "github.com/sirupsen/logrus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary" @@ -85,6 +88,53 @@ func _parseMainNGSFilter(text string) (obingslibrary.PrimerPair, obingslibrary.T true } +func NGSFilterCsvDetector(raw []byte, limit uint32) bool { + r := csv.NewReader(bytes.NewReader(dropLastLine(raw, limit))) + r.Comma = ',' + r.ReuseRecord = true + r.LazyQuotes = true + r.FieldsPerRecord = -1 + r.Comment = '#' + + nfields := 0 + + lines := 0 + for { + rec, err := r.Read() + if len(rec) > 0 && rec[0] == "@param" { + continue + } + if errors.Is(err, io.EOF) { + break + } + if err != nil { + return false + } + + if nfields == 0 { + nfields = len(rec) + } else if nfields != len(rec) { + return false + } + lines++ + } + + return nfields > 1 && lines > 1 + +} + +func dropLastLine(b []byte, readLimit uint32) []byte { + if readLimit == 0 || uint32(len(b)) < readLimit { + return b + } + for i := len(b) - 1; i > 0; i-- { + if b[i] == '\n' { + return b[:i] + } + } + return b +} + func OBIMimeNGSFilterTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) { // Create a buffer to store the read data @@ -95,6 +145,8 @@ func OBIMimeNGSFilterTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, e return nil, nil, err } + mimetype.Lookup("text/plain").Extend(NGSFilterCsvDetector, "text/ngsfilter-csv", ".csv") + // Detect the MIME type using the mimetype library mimeType := mimetype.Detect(buf[:n]) if mimeType == nil { @@ -111,7 +163,7 @@ func OBIMimeNGSFilterTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, e return mimeType, newReader, nil } -func ReadNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { +func ReadNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { mimetype, newReader, err := OBIMimeNGSFilterTypeGuesser(reader) if err != nil { @@ -120,13 +172,13 @@ func ReadNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { log.Infof("NGSFilter configuration mimetype: %s", mimetype.String()) - if mimetype.String() == "text/csv" { + if mimetype.String() == "text/ngsfilter-csv" { return ReadCSVNGSFilter(newReader) } return ReadOldNGSFilter(newReader) } -func ReadOldNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { +func ReadOldNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { ngsfilter := obingslibrary.MakeNGSLibrary() lines := _readLines(reader) @@ -154,7 +206,7 @@ func ReadOldNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { pcr, ok := marker.GetPCR(tags.Forward, tags.Reverse) if ok { - return ngsfilter, + return &ngsfilter, fmt.Errorf("line %d : tag pair (%s,%s) used more than once with marker (%s,%s)", i, tags.Forward, tags.Reverse, primers.Forward, primers.Reverse) } @@ -170,17 +222,47 @@ func ReadOldNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { } - return ngsfilter, nil + return &ngsfilter, nil } -func ReadCSVNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { +var library_parameter = map[string]func(library *obingslibrary.NGSLibrary, values ...string){ + "@spacer": func(library *obingslibrary.NGSLibrary, values ...string) { + switch len(values) { + case 0: + log.Fatalln("Missing value for @spacer parameter") + case 1: + spacer, err := strconv.Atoi(values[0]) + + if err != nil { + log.Fatalln("Invalid value for @spacer parameter") + } + + library.SetTagSpacer(spacer) + case 2: + primer := values[0] + spacer, err := strconv.Atoi(values[1]) + + if err != nil { + log.Fatalln("Invalid value for @spacer parameter") + } + + library.SetTagSpacerFor(primer, spacer) + default: + log.Fatalln("Invalid value for @spacer parameter") + } + }, +} + +func ReadCSVNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) { ngsfilter := obingslibrary.MakeNGSLibrary() file := csv.NewReader(reader) file.Comma = ',' - file.Comment = '#' - file.TrimLeadingSpace = true file.ReuseRecord = true + file.LazyQuotes = true + file.Comment = '#' + file.FieldsPerRecord = -1 + file.TrimLeadingSpace = true records, err := file.ReadAll() @@ -188,12 +270,30 @@ func ReadCSVNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { return nil, err } - log.Info("Read ", len(records), " records") - log.Infof("First record: %s", records[0]) + i := 0 + for i = 0; i < len(records) && records[i][0] == "@param"; i++ { + param := records[i][1] + if len(records[i]) < 3 { + log.Fatalf("At line %d: Missing value for parameter %s", i, param) + } + data := records[i][2:] + setparam, ok := library_parameter[param] + + if ok { + setparam(&ngsfilter, data...) + } else { + log.Warnf("At line %d: Skipping unknown parameter %s: %v", i, param, data) + } + } + + records = records[i:] header := records[0] data := records[1:] + log.Info("Read ", len(records), " records") + log.Infof("First record: %s", header) + // Find the index of the column named "sample" experimentColIndex := -1 sampleColIndex := -1 @@ -253,7 +353,7 @@ func ReadCSVNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { pcr, ok := marker.GetPCR(tags.Forward, tags.Reverse) if ok { - return ngsfilter, + return &ngsfilter, fmt.Errorf("line %d : tag pair (%s,%s) used more than once with marker (%s,%s)", i, tags.Forward, tags.Reverse, forward_primer, reverse_primer) } @@ -271,5 +371,5 @@ func ReadCSVNGSFilter(reader io.Reader) (obingslibrary.NGSLibrary, error) { } - return ngsfilter, nil + return &ngsfilter, nil } diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index 3ab8793..401ffce 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -510,14 +510,14 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate, trueIter := MakeIBioSequence() falseIter := MakeIBioSequence() + if iterator.IsPaired() { + trueIter.MarkAsPaired() + falseIter.MarkAsPaired() + } + trueIter.Add(1) falseIter.Add(1) - go func() { - trueIter.WaitAndClose() - falseIter.WaitAndClose() - }() - go func() { trueOrder := 0 falseOrder := 0 @@ -562,10 +562,11 @@ func (iterator IBioSequence) DivideOn(predicate obiseq.SequencePredicate, falseIter.Done() }() - if iterator.IsPaired() { - trueIter.MarkAsPaired() - falseIter.MarkAsPaired() - } + go func() { + trueIter.WaitAndClose() + falseIter.WaitAndClose() + }() + return trueIter, falseIter } diff --git a/pkg/obingslibrary/marker.go b/pkg/obingslibrary/marker.go new file mode 100644 index 0000000..5b04461 --- /dev/null +++ b/pkg/obingslibrary/marker.go @@ -0,0 +1,281 @@ +package obingslibrary + +import ( + "fmt" + "strings" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" +) + +type Marker struct { + forward obiapat.ApatPattern + cforward obiapat.ApatPattern + reverse obiapat.ApatPattern + creverse obiapat.ApatPattern + Forward_tag_length int + Reverse_tag_length int + Forward_spacer int + Reverse_spacer int + Forward_tag_delimiter byte + Reverse_tag_delimiter byte + samples map[TagPair]*PCR +} + +func (marker *Marker) Compile(forward, reverse string, maxError int, allowsIndel bool) error { + var err error + + marker.CheckTagLength() + + marker.forward, err = obiapat.MakeApatPattern(forward, maxError, allowsIndel) + if err != nil { + return err + } + marker.reverse, err = obiapat.MakeApatPattern(reverse, maxError, allowsIndel) + if err != nil { + return err + } + + marker.cforward, err = marker.forward.ReverseComplement() + if err != nil { + return err + } + marker.creverse, err = marker.reverse.ReverseComplement() + if err != nil { + return err + } + return nil +} + +// Match finds the best matching demultiplex for a given sequence. +// +// Parameters: +// +// marker - a pointer to a Marker struct that contains the forward and reverse primers. +// sequence - a pointer to a BioSequence struct that represents the input sequence. +// +// Returns: +// +// A pointer to a DemultiplexMatch struct that contains the best matching demultiplex. +func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { + aseq, _ := obiapat.MakeApatSequence(sequence, false) + + start, end, nerr, matched := marker.forward.BestMatch(aseq, marker.Forward_tag_length, -1) + + if matched { + if start < 0 { + start = 0 + } + + if end > sequence.Len() { + end = sequence.Len() + } + + sseq := sequence.String() + direct := sseq[start:end] + tagstart := max(start-marker.Forward_tag_length, 0) + ftag := strings.ToLower(sseq[tagstart:start]) + + m := DemultiplexMatch{ + ForwardMatch: direct, + ForwardTag: ftag, + BarcodeStart: end, + ForwardMismatches: nerr, + IsDirect: true, + Error: nil, + } + + start, end, nerr, matched = marker.creverse.BestMatch(aseq, start, -1) + + if matched { + + // extracting primer matches + reverse, _ := sequence.Subsequence(start, end, false) + defer reverse.Recycle() + reverse = reverse.ReverseComplement(true) + endtag := min(end+marker.Reverse_tag_length, sequence.Len()) + rtag, err := sequence.Subsequence(end, endtag, false) + defer rtag.Recycle() + srtag := "" + + if err != nil { + rtag = nil + } else { + rtag.ReverseComplement(true) + srtag = strings.ToLower(rtag.String()) + } + + m.ReverseMatch = strings.ToLower(reverse.String()) + m.ReverseMismatches = nerr + m.BarcodeEnd = start + m.ReverseTag = srtag + + sample, ok := marker.samples[TagPair{ftag, srtag}] + + if ok { + m.Pcr = sample + } + + return &m + + } + + m.Error = fmt.Errorf("cannot locates reverse priming site") + + return &m + } + + // + // At this point the forward primer didn't match. + // We try now with the reverse primer + // + + start, end, nerr, matched = marker.reverse.BestMatch(aseq, marker.Reverse_tag_length, -1) + + if matched { + if start < 0 { + start = 0 + } + + if end > sequence.Len() { + end = sequence.Len() + } + + sseq := sequence.String() + + reverse := strings.ToLower(sseq[start:end]) + tagstart := max(start-marker.Reverse_tag_length, 0) + rtag := strings.ToLower(sseq[tagstart:start]) + + m := DemultiplexMatch{ + ReverseMatch: reverse, + ReverseTag: rtag, + BarcodeStart: end, + ReverseMismatches: nerr, + IsDirect: false, + Error: nil, + } + + start, end, nerr, matched = marker.cforward.BestMatch(aseq, end, -1) + + if matched { + + direct, _ := sequence.Subsequence(start, end, false) + defer direct.Recycle() + direct = direct.ReverseComplement(true) + + endtag := min(end+marker.Forward_tag_length, sequence.Len()) + ftag, err := sequence.Subsequence(end, endtag, false) + defer ftag.Recycle() + sftag := "" + if err != nil { + ftag = nil + + } else { + ftag = ftag.ReverseComplement(true) + sftag = strings.ToLower(ftag.String()) + } + + m.ForwardMatch = direct.String() + m.ForwardTag = sftag + m.ForwardMismatches = nerr + m.BarcodeEnd = start + + sample, ok := marker.samples[TagPair{sftag, rtag}] + + if ok { + m.Pcr = sample + } + + return &m + } + + m.Error = fmt.Errorf("cannot locates forward priming site") + + return &m + } + + return nil +} + +func (marker *Marker) GetPCR(forward, reverse string) (*PCR, bool) { + pair := TagPair{forward, reverse} + pcr, ok := marker.samples[pair] + + if ok { + return pcr, ok + } + + ipcr := PCR{} + marker.samples[pair] = &ipcr + + return &ipcr, false +} + +func (marker *Marker) CheckTagLength() error { + forward_length := make(map[int]int) + reverse_length := make(map[int]int) + + marker.Forward_tag_length = -1 + marker.Reverse_tag_length = -1 + + for tags := range marker.samples { + forward_length[len(tags.Forward)]++ + reverse_length[len(tags.Reverse)]++ + } + + maxfl, _ := obiutils.MaxMap(forward_length) + + if len(forward_length) > 1 { + others := make([]int, 0) + for k := range forward_length { + if k != maxfl { + others = append(others, k) + } + } + return fmt.Errorf("forward tag length %d is not the same for all the PCRs : %v\n", maxfl, others) + } + + maxrl, _ := obiutils.MaxMap(reverse_length) + + if len(reverse_length) > 1 { + others := make([]int, 0) + for k := range reverse_length { + if k != maxrl { + others = append(others, k) + } + } + return fmt.Errorf("reverse tag length %d is not the same for all the PCRs : %v\n", maxrl, others) + } + + marker.Forward_tag_length = maxfl + marker.Reverse_tag_length = maxrl + return nil +} + +func (marker *Marker) SetForwardTagSpacer(spacer int) { + marker.Forward_spacer = spacer +} + +func (marker *Marker) SetReverseTagSpacer(spacer int) { + marker.Reverse_spacer = spacer +} + +func (marker *Marker) SetTagSpacer(spacer int) { + marker.SetForwardTagSpacer(spacer) + marker.SetReverseTagSpacer(spacer) +} + +func (marker *Marker) SetForwardTagDelimiter(delim byte) { + marker.Forward_tag_delimiter = delim +} + +func (marker *Marker) SetReverseTagDelimiter(delim byte) { + marker.Reverse_tag_delimiter = delim +} + +func (marker *Marker) SetTagDelimiter(delim byte) { + marker.SetForwardTagDelimiter(delim) + marker.SetReverseTagDelimiter(delim) +} diff --git a/pkg/obingslibrary/match.go b/pkg/obingslibrary/match.go index 68bd10e..75f2353 100644 --- a/pkg/obingslibrary/match.go +++ b/pkg/obingslibrary/match.go @@ -7,7 +7,6 @@ import ( log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" ) @@ -28,7 +27,7 @@ type DemultiplexMatch struct { } func (library *NGSLibrary) Compile(maxError int, allowsIndel bool) error { - for primers, marker := range *library { + for primers, marker := range library.Markers { err := marker.Compile(primers.Forward, primers.Reverse, maxError, allowsIndel) @@ -40,7 +39,7 @@ func (library *NGSLibrary) Compile(maxError int, allowsIndel bool) error { } func (library *NGSLibrary) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { - for primers, marker := range *library { + for primers, marker := range library.Markers { m := marker.Match(sequence) if m != nil { m.ForwardPrimer = strings.ToLower(primers.Forward) @@ -57,203 +56,6 @@ func (library *NGSLibrary) ExtractBarcode(sequence *obiseq.BioSequence, inplace return match.ExtractBarcode(sequence, inplace) } -func (marker *Marker) Compile(forward, reverse string, maxError int, allowsIndel bool) error { - var err error - marker.forward, err = obiapat.MakeApatPattern(forward, maxError, allowsIndel) - if err != nil { - return err - } - marker.reverse, err = obiapat.MakeApatPattern(reverse, maxError, allowsIndel) - if err != nil { - return err - } - - marker.cforward, err = marker.forward.ReverseComplement() - if err != nil { - return err - } - marker.creverse, err = marker.reverse.ReverseComplement() - if err != nil { - return err - } - - marker.taglength = 0 - for tags := range marker.samples { - lf := len(tags.Forward) - lr := len(tags.Reverse) - - l := lf - if lf == 0 { - l = lr - } - - if lr != 0 && l != lr { - return fmt.Errorf("forward tag (%s) and reverse tag (%s) do not have the same length", - tags.Forward, tags.Reverse) - } - - if marker.taglength != 0 && l != marker.taglength { - return fmt.Errorf("tag pair (%s,%s) is not compatible with a tag length of %d", - tags.Forward, tags.Reverse, marker.taglength) - } else { - marker.taglength = l - } - } - - return nil -} - -// Match finds the best matching demultiplex for a given sequence. -// -// Parameters: -// -// marker - a pointer to a Marker struct that contains the forward and reverse primers. -// sequence - a pointer to a BioSequence struct that represents the input sequence. -// -// Returns: -// -// A pointer to a DemultiplexMatch struct that contains the best matching demultiplex. -func (marker *Marker) Match(sequence *obiseq.BioSequence) *DemultiplexMatch { - aseq, _ := obiapat.MakeApatSequence(sequence, false) - - start, end, nerr, matched := marker.forward.BestMatch(aseq, marker.taglength, -1) - - if matched { - if start < 0 { - start = 0 - } - - if end > sequence.Len() { - end = sequence.Len() - } - - sseq := sequence.String() - direct := sseq[start:end] - tagstart := max(start-marker.taglength, 0) - ftag := strings.ToLower(sseq[tagstart:start]) - - m := DemultiplexMatch{ - ForwardMatch: direct, - ForwardTag: ftag, - BarcodeStart: end, - ForwardMismatches: nerr, - IsDirect: true, - Error: nil, - } - - start, end, nerr, matched = marker.creverse.BestMatch(aseq, start, -1) - - if matched { - - // extracting primer matches - reverse, _ := sequence.Subsequence(start, end, false) - defer reverse.Recycle() - reverse = reverse.ReverseComplement(true) - endtag := min(end+marker.taglength, sequence.Len()) - rtag, err := sequence.Subsequence(end, endtag, false) - defer rtag.Recycle() - srtag := "" - - if err != nil { - rtag = nil - } else { - rtag.ReverseComplement(true) - srtag = strings.ToLower(rtag.String()) - } - - m.ReverseMatch = strings.ToLower(reverse.String()) - m.ReverseMismatches = nerr - m.BarcodeEnd = start - m.ReverseTag = srtag - - sample, ok := marker.samples[TagPair{ftag, srtag}] - - if ok { - m.Pcr = sample - } - - return &m - - } - - m.Error = fmt.Errorf("cannot locates reverse priming site") - - return &m - } - - // - // At this point the forward primer didn't match. - // We try now with the reverse primer - // - - start, end, nerr, matched = marker.reverse.BestMatch(aseq, marker.taglength, -1) - - if matched { - if start < 0 { - start = 0 - } - - if end > sequence.Len() { - end = sequence.Len() - } - - sseq := sequence.String() - - reverse := strings.ToLower(sseq[start:end]) - tagstart := max(start-marker.taglength, 0) - rtag := strings.ToLower(sseq[tagstart:start]) - - m := DemultiplexMatch{ - ReverseMatch: reverse, - ReverseTag: rtag, - BarcodeStart: end, - ReverseMismatches: nerr, - IsDirect: false, - Error: nil, - } - - start, end, nerr, matched = marker.cforward.BestMatch(aseq, end, -1) - - if matched { - - direct, _ := sequence.Subsequence(start, end, false) - defer direct.Recycle() - direct = direct.ReverseComplement(true) - - endtag := min(end+marker.taglength, sequence.Len()) - ftag, err := sequence.Subsequence(end, endtag, false) - defer ftag.Recycle() - sftag := "" - if err != nil { - ftag = nil - - } else { - ftag = ftag.ReverseComplement(true) - sftag = strings.ToLower(ftag.String()) - } - - m.ForwardMatch = direct.String() - m.ForwardTag = sftag - m.ForwardMismatches = nerr - m.BarcodeEnd = start - - sample, ok := marker.samples[TagPair{sftag, rtag}] - - if ok { - m.Pcr = sample - } - - return &m - } - - m.Error = fmt.Errorf("cannot locates forward priming site") - - return &m - } - - return nil -} - // ExtractBarcode extracts the barcode from the given biosequence. // // Parameters: diff --git a/pkg/obingslibrary/multimatch.go b/pkg/obingslibrary/multimatch.go new file mode 100644 index 0000000..6cb18f6 --- /dev/null +++ b/pkg/obingslibrary/multimatch.go @@ -0,0 +1,186 @@ +package obingslibrary + +import ( + "sort" + + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +type PrimerMatch struct { + Begin int + End int + Mismatches int + Marker int + Forward bool +} + +type TagMatcher func( + sequence *obiseq.BioSequence, + begin, end int, forward bool) (TagPair, error) + +// func (library *NGSLibrary) MakeTagMatcherFixedLength() TagMatcher { +// return func(sequence *obiseq.BioSequence, begin, end int, forward bool) (TagPair, error) { +// fb := 0 +// fe := 0 +// if forward { +// fb = begin - library.Forward_spacer - library.Forward_tag_length +// } else { +// fb = begin - library.Reverse_spacer - library.Reverse_tag_length +// } + +// if fb < 0 { +// return TagPair{}, fmt.Errorf("begin too small") +// } +// if forward { +// fe = end + library.Reverse_tag_length + library.Reverse_spacer +// } else { +// fe = end + library.Forward_tag_length + library.Forward_spacer +// } + +// if fe > len(sequence.String()) { +// return TagPair{}, fmt.Errorf("end too large") +// } + +// ftag := sequence.String()[fb:begin] +// rtag, err := sequence.Subsequence(end, fe, true) + +// if err != nil { +// return TagPair{}, fmt.Errorf("error in subsequence : %v", err) +// } + +// return TagPair{Forward: ftag, Reverse: rtag.String()}, nil +// } + +// } + +func (library *NGSLibrary) ExtractMultiBarcode(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + i := 1 + markers := make([]*Marker, len(library.Markers)+1) + primerseqs := make([]PrimerPair, len(library.Markers)+1) + matches := make([]PrimerMatch, 0, len(library.Markers)+1) + aseq, err := obiapat.MakeApatSequence(sequence, false) + + if err != nil { + log.Fatalf("error in building apat sequence : %v\n", err) + } + + for primers, marker := range library.Markers { + markers[i] = marker + primerseqs[i] = primers + locs := marker.forward.AllMatches(aseq, 0, -1) + if len(locs) > 0 { + for _, loc := range locs { + matches = append(matches, PrimerMatch{ + Begin: loc[0], + End: loc[1], + Mismatches: loc[2], + Marker: i, + Forward: true, + }) + + } + + locs = marker.creverse.AllMatches(aseq, locs[0][0]+1, -1) + + if len(locs) > 0 { + for _, loc := range locs { + matches = append(matches, PrimerMatch{ + Begin: loc[0], + End: loc[1], + Mismatches: loc[2], + Marker: -i, + Forward: true, + }) + } + } + } + + locs = marker.reverse.AllMatches(aseq, 0, -1) + if len(locs) > 0 { + for _, loc := range locs { + matches = append(matches, PrimerMatch{ + Begin: loc[0], + End: loc[1], + Mismatches: loc[2], + Marker: i, + Forward: false, + }) + } + + locs = marker.cforward.AllMatches(aseq, locs[0][0]+1, -1) + + if len(locs) > 0 { + for _, loc := range locs { + matches = append(matches, PrimerMatch{ + Begin: loc[0], + End: loc[1], + Mismatches: loc[2], + Marker: -i, + Forward: false, + }) + } + } + } + i++ + } + + if len(matches) > 0 { + sort.Slice(matches, func(i, j int) bool { + return matches[i].Begin < matches[j].Begin + }) + + state := 0 + var from PrimerMatch + q := 0 + for _, match := range matches { + + switch state { + case 0: + if match.Marker > 0 { + from = match + state = 1 + } + + case 1: + if match.Marker == -from.Marker && match.Forward == from.Forward { + q++ + log.Infof("%d -- %s [%s:%s] %s : %d -> %d mismatches : %d:%d", + q, + sequence.Id(), + primerseqs[from.Marker].Forward, + primerseqs[from.Marker].Reverse, + map[bool]string{true: "forward", false: "reverse"}[from.Forward], + from.End, + match.Begin-1, + from.Mismatches, + match.Mismatches, + ) + state = 0 + } else if match.Marker > 0 { + log.Warnf("Marker mismatch : %d %d", match.Marker, from.Marker) + from = match + } else { + log.Warnf("Marker mismatch : %d %d", match.Marker, from.Marker) + state = 0 + } + } + } + } + + return nil, nil +} + +func (library *NGSLibrary) ExtractMultiBarcodeSliceWorker(options ...WithOption) obiseq.SeqSliceWorker { + opt := MakeOptions(options) + + library.Compile(opt.AllowedMismatch(), opt.AllowsIndel()) + + worker := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + return library.ExtractMultiBarcode(sequence) + } + + return obiseq.SeqToSliceWorker(worker, true) +} diff --git a/pkg/obingslibrary/ngslibrary.go b/pkg/obingslibrary/ngslibrary.go index 31ca20f..8b8c7ae 100644 --- a/pkg/obingslibrary/ngslibrary.go +++ b/pkg/obingslibrary/ngslibrary.go @@ -1,7 +1,8 @@ package obingslibrary import ( - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" + "fmt" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" ) @@ -22,44 +23,112 @@ type PCR struct { Annotations obiseq.Annotation } -type Marker struct { - forward obiapat.ApatPattern - cforward obiapat.ApatPattern - reverse obiapat.ApatPattern - creverse obiapat.ApatPattern - taglength int - samples map[TagPair]*PCR +type NGSLibrary struct { + Matching string + Primers map[string]PrimerPair + Markers map[PrimerPair]*Marker } -type NGSLibrary map[PrimerPair]*Marker func MakeNGSLibrary() NGSLibrary { - return make(NGSLibrary, 10) + return NGSLibrary{ + Primers: make(map[string]PrimerPair, 10), + Markers: make(map[PrimerPair]*Marker, 10), + } } func (library *NGSLibrary) GetMarker(forward, reverse string) (*Marker, bool) { pair := PrimerPair{forward, reverse} - marker, ok := (*library)[pair] + marker, ok := (library.Markers)[pair] if ok { return marker, true } - m := Marker{samples: make(map[TagPair]*PCR, 1000)} - (*library)[pair] = &m + m := Marker{ + Forward_tag_length: 0, + Reverse_tag_length: 0, + Forward_spacer: 0, + Reverse_spacer: 0, + Forward_tag_delimiter: 0, + Reverse_tag_delimiter: 0, + samples: make(map[TagPair]*PCR, 1000), + } + + (library.Markers)[pair] = &m return &m, false } -func (marker *Marker) GetPCR(forward, reverse string) (*PCR, bool) { - pair := TagPair{forward, reverse} - pcr, ok := marker.samples[pair] +func (library *NGSLibrary) SetForwardTagSpacer(spacer int) { + for _, marker := range library.Markers { + marker.SetForwardTagSpacer(spacer) + } +} + +func (library *NGSLibrary) SetReverseTagSpacer(spacer int) { + for _, marker := range library.Markers { + marker.SetReverseTagSpacer(spacer) + } +} + +func (library *NGSLibrary) SetTagSpacer(spacer int) { + library.SetForwardTagSpacer(spacer) + library.SetReverseTagSpacer(spacer) +} + +func (library *NGSLibrary) SetTagSpacerFor(primer string, spacer int) { + primers, ok := library.Primers[primer] if ok { - return pcr, ok + marker, ok := library.Markers[primers] + + if ok { + if primer == primers.Forward { + marker.SetForwardTagSpacer(spacer) + } else { + marker.SetReverseTagSpacer(spacer) + } + } } - - ipcr := PCR{} - marker.samples[pair] = &ipcr - - return &ipcr, false +} + +func (library *NGSLibrary) SetForwardTagDelimiter(delim byte) { + for _, marker := range library.Markers { + marker.SetForwardTagDelimiter(delim) + } +} + +func (library *NGSLibrary) SetReverseTagDelimiter(delim byte) { + for _, marker := range library.Markers { + marker.SetReverseTagDelimiter(delim) + } +} + +func (library *NGSLibrary) SetTagDelimiter(delim byte) { + library.SetForwardTagDelimiter(delim) + library.SetReverseTagDelimiter(delim) +} + +func (library *NGSLibrary) CheckTagLength() { + + for _, marker := range library.Markers { + marker.CheckTagLength() + } +} + +func (library *NGSLibrary) CheckPrimerUnicity() error { + for primers := range library.Markers { + if _, ok := library.Primers[primers.Forward]; ok { + return fmt.Errorf("forward primer %s is used more than once", primers.Forward) + } + if _, ok := library.Primers[primers.Reverse]; ok { + return fmt.Errorf("reverse primer %s is used more than once", primers.Reverse) + } + if primers.Forward == primers.Reverse { + return fmt.Errorf("forward and reverse primers are the same : %s", primers.Forward) + } + library.Primers[primers.Forward] = primers + library.Primers[primers.Reverse] = primers + } + return nil } diff --git a/pkg/obingslibrary/worker.go b/pkg/obingslibrary/worker.go index 0ff065e..91e0e01 100644 --- a/pkg/obingslibrary/worker.go +++ b/pkg/obingslibrary/worker.go @@ -139,7 +139,7 @@ func MakeOptions(setters []WithOption) Options { return opt } -func _ExtractBarcodeSlice(ngslibrary NGSLibrary, +func _ExtractBarcodeSlice(ngslibrary *NGSLibrary, sequences obiseq.BioSequenceSlice, options Options) obiseq.BioSequenceSlice { newSlice := make(obiseq.BioSequenceSlice, 0, len(sequences)) @@ -154,7 +154,7 @@ func _ExtractBarcodeSlice(ngslibrary NGSLibrary, return newSlice } -func ExtractBarcodeSlice(ngslibrary NGSLibrary, +func ExtractBarcodeSlice(ngslibrary *NGSLibrary, sequences obiseq.BioSequenceSlice, options ...WithOption) obiseq.BioSequenceSlice { @@ -165,7 +165,7 @@ func ExtractBarcodeSlice(ngslibrary NGSLibrary, return _ExtractBarcodeSlice(ngslibrary, sequences, opt) } -func ExtractBarcodeSliceWorker(ngslibrary NGSLibrary, +func ExtractBarcodeSliceWorker(ngslibrary *NGSLibrary, options ...WithOption) obiseq.SeqSliceWorker { opt := MakeOptions(options) diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index db7494f..ba33696 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "b842d60" +var _Commit = "bfe3d0e" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/biosequence.go b/pkg/obiseq/biosequence.go index 2cab788..04d4185 100644 --- a/pkg/obiseq/biosequence.go +++ b/pkg/obiseq/biosequence.go @@ -161,7 +161,7 @@ func (sequence *BioSequence) Recycle() { // Copy returns a new BioSequence that is a copy of the original BioSequence. // -// It copies the id and definition fields of the original BioSequence to the new BioSequence. +// It copies the id of the original BioSequence to the new BioSequence. // It also creates new slices and copies the values from the original BioSequence's sequence, qualities, and feature fields to the new BioSequence. // If the original BioSequence has annotations, it locks the annot_lock and copies the annotations to the new BioSequence. // @@ -170,15 +170,14 @@ func (s *BioSequence) Copy() *BioSequence { newSeq := NewEmptyBioSequence(0) newSeq.id = s.id - //newSeq.definition = s.definition newSeq.sequence = CopySlice(s.sequence) newSeq.qualities = CopySlice(s.qualities) newSeq.feature = CopySlice(s.feature) if len(s.annotations) > 0 { - defer s.annot_lock.Unlock() s.annot_lock.Lock() + defer s.annot_lock.Unlock() newSeq.annotations = GetAnnotation(s.annotations) } @@ -210,6 +209,10 @@ func (s *BioSequence) Definition() string { return definition } +func (s *BioSequence) HasDefinition() bool { + return s.HasAttribute("definition") +} + // HasSequence checks if the BioSequence has a sequence. // // No parameters. @@ -420,8 +423,7 @@ func (s *BioSequence) SetSequence(sequence []byte) { if s.sequence != nil { RecycleSlice(&s.sequence) } - s.sequence = GetSlice(len(sequence))[0:len(sequence)] - copy(s.sequence, obiutils.InPlaceToLower(sequence)) + s.sequence = CopySlice(obiutils.InPlaceToLower(sequence)) } // Setting the qualities of the BioSequence. @@ -429,8 +431,7 @@ func (s *BioSequence) SetQualities(qualities Quality) { if s.qualities != nil { RecycleSlice(&s.qualities) } - s.qualities = GetSlice(len(qualities))[0:len(qualities)] - copy(s.qualities, qualities) + s.qualities = CopySlice(qualities) } // A method that appends a byte slice to the qualities of the BioSequence. diff --git a/pkg/obitools/obimultiplex/options.go b/pkg/obitools/obimultiplex/options.go index 8f590fa..a62835f 100644 --- a/pkg/obitools/obimultiplex/options.go +++ b/pkg/obitools/obimultiplex/options.go @@ -77,7 +77,7 @@ func CLIHasNGSFilterFile() bool { return _NGSFilterFile != "" } -func CLINGSFIlter() (obingslibrary.NGSLibrary, error) { +func CLINGSFIlter() (*obingslibrary.NGSLibrary, error) { file, err := os.Open(_NGSFilterFile) if err != nil { diff --git a/pkg/obitools/obimultiplex2/demultiplex.go b/pkg/obitools/obimultiplex2/demultiplex.go new file mode 100644 index 0000000..4f3d692 --- /dev/null +++ b/pkg/obitools/obimultiplex2/demultiplex.go @@ -0,0 +1,60 @@ +package obimultiplex2 + +import ( + log "github.com/sirupsen/logrus" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" +) + +func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error) { + + opts := make([]obingslibrary.WithOption, 0, 10) + + opts = append(opts, + obingslibrary.OptionAllowedMismatches(CLIAllowedMismatch()), + obingslibrary.OptionAllowedIndel(CLIAllowsIndel()), + obingslibrary.OptionUnidentified(CLIUnidentifiedFileName()), + obingslibrary.OptionDiscardErrors(!CLIConservedErrors()), + obingslibrary.OptionParallelWorkers(obioptions.CLIParallelWorkers()), + obingslibrary.OptionBatchSize(obioptions.CLIBatchSize()), + ) + + ngsfilter, err := CLINGSFIlter() + if err != nil { + log.Fatalf("%v", err) + } + + worker := ngsfilter.ExtractMultiBarcodeSliceWorker(opts...) + + newIter := iterator.MakeISliceWorker(worker, false) + + if !CLIConservedErrors() { + log.Infoln("Discards unassigned sequences") + newIter = newIter.FilterOn(obiseq.HasAttribute("demultiplex_error").Not(), obioptions.CLIBatchSize()) + } + + var unidentified obiiter.IBioSequence + if CLIUnidentifiedFileName() != "" { + log.Printf("Unassigned sequences saved in file: %s\n", CLIUnidentifiedFileName()) + unidentified, newIter = newIter.DivideOn(obiseq.HasAttribute("demultiplex_error"), + obioptions.CLIBatchSize()) + + go func() { + _, err := obiconvert.CLIWriteBioSequences(unidentified, + true, + CLIUnidentifiedFileName()) + + if err != nil { + log.Fatalf("%v", err) + } + }() + + } + log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers()) + + return newIter, nil +} diff --git a/pkg/obitools/obimultiplex2/options.go b/pkg/obitools/obimultiplex2/options.go new file mode 100644 index 0000000..afa56d9 --- /dev/null +++ b/pkg/obitools/obimultiplex2/options.go @@ -0,0 +1,108 @@ +package obimultiplex2 + +import ( + "fmt" + "os" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" + + log "github.com/sirupsen/logrus" +) + +var _NGSFilterFile = "" +var _askTemplate = false +var _UnidentifiedFile = "" +var _AllowedMismatch = int(2) +var _AllowsIndel = false +var _ConservedError = false + +// PCROptionSet defines every options related to a simulated PCR. +// +// The function adds to a CLI every options proposed to the user +// to tune the parametters of the PCR simulation algorithm. +// +// # Parameters +// +// - option : is a pointer to a getoptions.GetOpt instance normaly +// produced by the +func MultiplexOptionSet(options *getoptions.GetOpt) { + options.StringVar(&_NGSFilterFile, "tag-list", _NGSFilterFile, + options.Alias("t"), + options.Description("File name of the NGSFilter file describing PCRs.")) + + options.BoolVar(&_ConservedError, "keep-errors", _ConservedError, + options.Description("Prints symbol counts.")) + + options.BoolVar(&_AllowsIndel, "with-indels", _AllowsIndel, + options.Description("Allows for indels during the primers matching.")) + + options.StringVar(&_UnidentifiedFile, "unidentified", _UnidentifiedFile, + options.Alias("u"), + options.Description("Filename used to store the sequences unassigned to any sample.")) + + options.IntVar(&_AllowedMismatch, "allowed-mismatches", _AllowedMismatch, + options.Alias("e"), + options.Description("Used to specify the number of errors allowed for matching primers.")) + + options.BoolVar(&_askTemplate, "template", _askTemplate, + options.Description("Print on the standard output an example of CSV configuration file."), + ) + +} + +func OptionSet(options *getoptions.GetOpt) { + obiconvert.OptionSet(options) + MultiplexOptionSet(options) +} + +func CLIAllowedMismatch() int { + return _AllowedMismatch +} + +func CLIAllowsIndel() bool { + return _AllowsIndel +} +func CLIUnidentifiedFileName() string { + return _UnidentifiedFile +} + +func CLIConservedErrors() bool { + return _UnidentifiedFile != "" || _ConservedError +} + +func CLIHasNGSFilterFile() bool { + return _NGSFilterFile != "" +} + +func CLINGSFIlter() (*obingslibrary.NGSLibrary, error) { + file, err := os.Open(_NGSFilterFile) + + if err != nil { + return nil, fmt.Errorf("open file error: %v", err) + } + + log.Infof("Reading NGSFilter file: %s", _NGSFilterFile) + ngsfiler, err := obiformats.ReadNGSFilter(file) + + if err != nil { + return nil, fmt.Errorf("NGSfilter reading file error: %v", err) + } + + return ngsfiler, nil +} + +func CLIAskConfigTemplate() bool { + return _askTemplate +} + +func CLIConfigTemplate() string { + return `experiment,sample,sample_tag,forward_primer,reverse_primer +wolf_diet,13a_F730603,aattaac,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG +wolf_diet,15a_F730814,gaagtag:gaatatc,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG +wolf_diet,26a_F040644,gaatatc:-,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG +wolf_diet,29a_F260619,-:-,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG +` +} diff --git a/pkg/obitools/obitagpcr/pcrtag.go b/pkg/obitools/obitagpcr/pcrtag.go index d8cbd40..d7eb6c2 100644 --- a/pkg/obitools/obitagpcr/pcrtag.go +++ b/pkg/obitools/obitagpcr/pcrtag.go @@ -33,12 +33,6 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence, newIter := obiiter.MakeIBioSequence() newIter.MarkAsPaired() - newIter.Add(nworkers) - - go func() { - newIter.WaitAndClose() - log.Printf("End of the sequence PCR Taging") - }() f := func(iterator obiiter.IBioSequence, wid int) { arena := obialign.MakePEAlignArena(150, 150) @@ -128,16 +122,22 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence, log.Printf("Start of the sequence Pairing using %d workers\n", nworkers) + newIter.Add(nworkers) for i := 1; i < nworkers; i++ { go f(iterator.Split(), i) } go f(iterator, 0) + go func() { + newIter.WaitAndClose() + log.Printf("End of the sequence PCR Taging") + }() + iout := newIter if !obimultiplex.CLIConservedErrors() { log.Println("Discards unassigned sequences") - iout = iout.Rebatch(obioptions.CLIBatchSize()) + iout = iout.FilterOn(obiseq.HasAttribute("demultiplex_error").Not(), obioptions.CLIBatchSize()) } var unidentified obiiter.IBioSequence diff --git a/pkg/obiutils/minmax.go b/pkg/obiutils/minmax.go index 1c48440..ca9adc8 100644 --- a/pkg/obiutils/minmax.go +++ b/pkg/obiutils/minmax.go @@ -1,6 +1,7 @@ package obiutils import ( + log "github.com/sirupsen/logrus" "golang.org/x/exp/constraints" ) @@ -29,3 +30,20 @@ func MinMaxSlice[T constraints.Ordered](vec []T) (min, max T) { return } + +func MaxMap[K comparable, T constraints.Ordered](values map[K]T) (K, T) { + + if len(values) == 0 { + log.Panicf("empty map") + } + + var maxKey K + var maxValue T + for k, v := range values { + if v > maxValue { + maxValue = v + maxKey = k + } + } + return maxKey, maxValue +}