diff --git a/Release-notes.md b/Release-notes.md index bfdcfd8..da0c138 100644 --- a/Release-notes.md +++ b/Release-notes.md @@ -2,6 +2,12 @@ ## Latest changes +### New features + +- Most of the time obitools identify automatically sequence file format. But + it fails sometimes. Two new option **--fasta** and **--fastq** are added to + allow the processing of the rare fasta and fastq files not recognized. + ## August 2nd, 2024. Release 4.3.0 ### Change of git repositiory diff --git a/pkg/obialign/locatepattern.go b/pkg/obialign/locatepattern.go index d6057ee..2a8f1de 100644 --- a/pkg/obialign/locatepattern.go +++ b/pkg/obialign/locatepattern.go @@ -25,17 +25,18 @@ func buffIndex(i, j, width int) int { // // The function returns the start and end positions of the best // match, as well as the number of errors in the best match. -func LocatePattern(pattern, sequence []byte) (int, int, int) { +func LocatePattern(id string, pattern, sequence []byte) (int, int, int) { + + if len(pattern) >= len(sequence) { + log.Panicf("Sequence %s:Pattern %s must be shorter than sequence %s", id, pattern, sequence) + } + // Pattern spreads over the columns // Sequence spreads over the rows width := len(pattern) + 1 buffsize := (len(pattern) + 1) * (len(sequence) + 1) buffer := make([]int, buffsize) - if len(pattern) >= len(sequence) { - log.Panicf("Pattern %s must be shorter than sequence %s", pattern, sequence) - } - // The path matrix keeps track of the best path through the matrix // 0 : indicate the diagonal path // 1 : indicate the up path diff --git a/pkg/obiapat/pattern.go b/pkg/obiapat/pattern.go index 6900c62..2edbac8 100644 --- a/pkg/obiapat/pattern.go +++ b/pkg/obiapat/pattern.go @@ -297,6 +297,24 @@ func (pattern ApatPattern) FindAllIndex(sequence ApatSequence, begin, length int return loc } +func (pattern ApatPattern) IsMatching(sequence ApatSequence, begin, length int) bool { + if begin < 0 { + begin = 0 + } + + if length < 0 { + length = sequence.Len() + } + + nhits := int(C.ManberAll(sequence.pointer.pointer, + pattern.pointer.pointer, + 0, + C.int32_t(begin), + C.int32_t(length+C.MAX_PAT_LEN))) + + return nhits > 0 +} + // BestMatch finds the best match of a given pattern in a sequence. // // THe function identify the first occurrence of the pattern in the sequence. @@ -336,6 +354,11 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) ( nerr = best[2] end = best[1] + if best[0] < 0 || best[1] > sequence.Len() { + matched = false + return + } + if nerr == 0 || !pattern.pointer.pointer.hasIndel { start = best[0] log.Debugln("No nws ", start, nerr) @@ -356,14 +379,16 @@ func (pattern ApatPattern) BestMatch(sequence ApatSequence, begin, length int) ( best[0], nerr, int(pattern.pointer.pointer.patlen), sequence.Len(), start, end) - from, to, score := obialign.LocatePattern((*cpattern)[0:int(pattern.pointer.pointer.patlen)], frg) + from, to, score := obialign.LocatePattern(sequence.pointer.reference.Id(), + (*cpattern)[0:int(pattern.pointer.pointer.patlen)], + frg) // olderr := m[2] nerr = score start = start + from end = start + to - log.Debugln("results", score, start, nerr) + log.Debugf("BestMatch on %s : score=%d [%d..%d]", sequence.pointer.reference.Id(), score, start, nerr) return } @@ -454,7 +479,10 @@ func (pattern ApatPattern) AllMatches(sequence ApatSequence, begin, length int) cpattern := (*[1 << 30]byte)(unsafe.Pointer(pattern.pointer.pointer.cpat)) frg := sequence.pointer.reference.Sequence()[start:end] - pb, pe, score := obialign.LocatePattern((*cpattern)[0:int(pattern.pointer.pointer.patlen)], frg) + pb, pe, score := obialign.LocatePattern( + sequence.pointer.reference.Id(), + (*cpattern)[0:int(pattern.pointer.pointer.patlen)], + frg) // olderr := m[2] m[2] = score diff --git a/pkg/obiapat/predicat.go b/pkg/obiapat/predicat.go new file mode 100644 index 0000000..7ff5a36 --- /dev/null +++ b/pkg/obiapat/predicat.go @@ -0,0 +1,40 @@ +package obiapat + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + log "github.com/sirupsen/logrus" +) + +func IsPatternMatchSequence(pattern string, errormax int, bothStrand, allowIndels bool) obiseq.SequencePredicate { + + pat, err := MakeApatPattern(pattern, errormax, allowIndels) + + if err != nil { + log.Fatalf("error in sequence regular pattern syntax : %v", err) + } + + cpat, err := pat.ReverseComplement() + + if err != nil { + log.Fatalf("cannot reverse complement the pattern : %v", err) + } + + f := func(sequence *obiseq.BioSequence) bool { + aseq, err := MakeApatSequence(sequence, false) + + if err != nil { + log.Panicf("Cannot convert sequence %s to apat format", sequence.Id()) + } + + match := pat.IsMatching(aseq, 0, aseq.Len()) + + if !match && bothStrand { + + match = cpat.IsMatching(aseq, 0, aseq.Len()) + } + + return match + } + + return f +} diff --git a/pkg/obiformats/universal_read.go b/pkg/obiformats/universal_read.go index 07cad24..6ee0750 100644 --- a/pkg/obiformats/universal_read.go +++ b/pkg/obiformats/universal_read.go @@ -107,7 +107,7 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) { mimetype.Lookup("application/octet-stream").Extend(csv, "text/csv", ".csv") // Create a buffer to store the read data - buf := make([]byte, 1024*128) + buf := make([]byte, 1024*1024) n, err := io.ReadFull(stream, buf) if err != nil && err != io.ErrUnexpectedEOF { diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index ba4d997..075fddf 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "373464c" +var _Commit = "65ae826" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obiseq/predicate.go b/pkg/obiseq/predicate.go index 0f00080..4fc8cf7 100644 --- a/pkg/obiseq/predicate.go +++ b/pkg/obiseq/predicate.go @@ -209,7 +209,6 @@ func IsSequenceMatch(pattern string) SequencePredicate { return f } - func IsDefinitionMatch(pattern string) SequencePredicate { pat, err := regexp.Compile(pattern) diff --git a/pkg/obitools/obiannotate/obiannotate.go b/pkg/obitools/obiannotate/obiannotate.go index 585812c..a3ea644 100644 --- a/pkg/obitools/obiannotate/obiannotate.go +++ b/pkg/obitools/obiannotate/obiannotate.go @@ -25,7 +25,7 @@ func DeleteAttributesWorker(toBeDeleted []string) obiseq.SeqWorker { return f } -func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) obiseq.SeqWorker { +func MatchPatternWorker(pattern, name string, errormax int, bothStrand, allowsIndel bool) obiseq.SeqWorker { pat, err := obiapat.MakeApatPattern(pattern, errormax, allowsIndel) if err != nil { log.Fatalf("error in compiling pattern (%s) : %v", pattern, err) @@ -56,7 +56,7 @@ func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) ob start, end, nerr, matched := pat.BestMatch(apats, 0, s.Len()) - if matched { + if matched && start >= 0 && end <= s.Len() { annot := s.Annotations() annot[slot] = pattern @@ -75,7 +75,7 @@ func MatchPatternWorker(pattern, name string, errormax int, allowsIndel bool) ob } else { start, end, nerr, matched := cpat.BestMatch(apats, 0, s.Len()) - if matched { + if matched && start >= 0 && end <= s.Len() { annot := s.Annotations() annot[slot] = pattern match, err := s.Subsequence(start, end, false) @@ -328,9 +328,10 @@ func CLIAnnotationWorker() obiseq.SeqWorker { } if CLIHasPattern() { - log.Infof("Match pattern %s with %d error", CLIPattern(), CLIPatternError()) + log.Infof("Match pattern %s with %d error", CLIPattern(), obigrep.CLIPatternError()) w := MatchPatternWorker(CLIPattern(), CLIHasPatternName(), - CLIPatternError(), CLIPatternInDels()) + obigrep.CLIPatternError(), obigrep.CLIPatternBothStrand(), + obigrep.CLIPatternInDels()) annotator = annotator.ChainWorkers(w) } diff --git a/pkg/obitools/obiannotate/options.go b/pkg/obitools/obiannotate/options.go index f7e722e..071cdaa 100644 --- a/pkg/obitools/obiannotate/options.go +++ b/pkg/obitools/obiannotate/options.go @@ -24,8 +24,6 @@ var _setSeqLength = false var _uniqueID = false var _ahoCorazick = "" var _pattern = "" -var _pattern_error = 0 -var _pattern_indel = false var _pattern_name = "pattern" var _lcaSlot = "" var _lcaError = 0.0 @@ -62,14 +60,6 @@ func SequenceAnnotationOptionSet(options *getoptions.GetOpt) { options.Description("specify the name to use as prefix for the slots reporting the match"), ) - options.IntVar(&_pattern_error, "pattern-error", _pattern_error, - options.Description("Maximum number of allowed error during pattern matching"), - ) - - options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel, - options.Description("Allows for indel during pattern matching"), - ) - options.StringVar(&_lcaSlot, "add-lca-in", _lcaSlot, options.ArgName("SLOT_NAME"), options.Description("From the taxonomic annotation of the sequence (taxid slot or merged_taxid slot), "+ @@ -304,14 +294,6 @@ func CLIHasPatternName() string { return _pattern_name } -func CLIPatternError() int { - return _pattern_error -} - -func CLIPatternInDels() bool { - return _pattern_indel -} - func CLISetTaxonomicPath() bool { return _taxonomicPath } diff --git a/pkg/obitools/obiconvert/options.go b/pkg/obitools/obiconvert/options.go index a552e7f..2f2c2f1 100644 --- a/pkg/obitools/obiconvert/options.go +++ b/pkg/obitools/obiconvert/options.go @@ -19,6 +19,8 @@ var __input_fastobi_format__ = false var __input_ecopcr_format__ = false var __input_embl_format__ = false var __input_genbank_format__ = false +var __input_fastq_format__ = false +var __input_fasta_format__ = false var __output_in_fasta__ = false var __output_in_fastq__ = false @@ -56,6 +58,12 @@ func InputOptionSet(options *getoptions.GetOpt) { options.BoolVar(&__input_genbank_format__, "genbank", __input_genbank_format__, options.Description("Read data following the Genbank flatfile format.")) + options.BoolVar(&__input_fastq_format__, "fastq", __input_fastq_format__, + options.Description("Read data following the fastq format.")) + + options.BoolVar(&__input_fasta_format__, "fasta", __input_fasta_format__, + options.Description("Read data following the fasta format.")) + options.BoolVar(&__no_ordered_input__, "no-order", __no_ordered_input__, options.Description("When several input files are provided, "+ "indicates that there is no order among them.")) @@ -116,6 +124,10 @@ func OptionSet(options *getoptions.GetOpt) { // file has to be printed. func CLIInputFormat() string { switch { + case __input_fasta_format__: + return "fasta" + case __input_fastq_format__: + return "fastq" case __input_ecopcr_format__: return "ecopcr" case __input_embl_format__: diff --git a/pkg/obitools/obiconvert/sequence_reader.go b/pkg/obitools/obiconvert/sequence_reader.go index 4f72971..83b22fe 100644 --- a/pkg/obitools/obiconvert/sequence_reader.go +++ b/pkg/obitools/obiconvert/sequence_reader.go @@ -138,6 +138,10 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) { } switch CLIInputFormat() { + case "fastq": + reader = obiformats.ReadFastqFromFile + case "fasta": + reader = obiformats.ReadFastaFromFile case "ecopcr": reader = obiformats.ReadEcoPCRFromFile case "embl": diff --git a/pkg/obitools/obigrep/options.go b/pkg/obitools/obigrep/options.go index 7b5cf6c..8295dc4 100644 --- a/pkg/obitools/obigrep/options.go +++ b/pkg/obitools/obigrep/options.go @@ -6,6 +6,7 @@ import ( log "github.com/sirupsen/logrus" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats/ncbitaxdump" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" @@ -43,6 +44,11 @@ var _SaveRejected = "" var _PairedMode = "forward" +var _approx_pattern = make([]string, 0) +var _pattern_error = 0 +var _pattern_indel = false +var _pattern_only_forward = false + func TaxonomySelectionOptionSet(options *getoptions.GetOpt) { options.StringVar(&_Taxdump, "taxdump", _Taxdump, @@ -143,6 +149,23 @@ func SequenceSelectionOptionSet(options *getoptions.GetOpt) { options.Description("If paired reads are passed to obibrep, that option determines how the conditions "+ "are applied to both reads."), ) + + options.StringSliceVar(&_approx_pattern, "approx-pattern", 1, 1, + options.ArgName("PATTERN"), + options.Description("Regular expression pattern to be tested against the sequence itself. The pattern is case insensitive.")) + + options.IntVar(&_pattern_error, "pattern-error", _pattern_error, + options.Description("Maximum number of allowed error during pattern matching"), + ) + + options.BoolVar(&_pattern_indel, "allows-indels", _pattern_indel, + options.Description("Allows for indel during pattern matching"), + ) + + options.BoolVar(&_pattern_only_forward, "only-forward", _pattern_only_forward, + options.Description("Look for pattern only on forward strand"), + ) + } // OptionSet adds to the basic option set every options declared for @@ -212,6 +235,18 @@ func CLISelectedNCBITaxDump() string { return _Taxdump } +func CLIPatternError() int { + return _pattern_error +} + +func CLIPatternInDels() bool { + return _pattern_indel +} + +func CLIPatternBothStrand() bool { + return !_pattern_only_forward +} + func CLILoadSelectedTaxonomy() *obitax.Taxonomy { if CLISelectedNCBITaxDump() != "" { if _Taxonomy == nil { @@ -327,6 +362,30 @@ func CLISequencePatternPredicate() obiseq.SequencePredicate { return nil } +func CLISequenceAgrep() obiseq.SequencePredicate { + if len(_approx_pattern) > 0 { + p := obiapat.IsPatternMatchSequence( + _approx_pattern[0], + CLIPatternError(), + CLIPatternBothStrand(), + CLIPatternInDels(), + ) + + for _, pattern := range _approx_pattern[1:] { + p = p.And(obiapat.IsPatternMatchSequence( + pattern, + CLIPatternError(), + CLIPatternBothStrand(), + CLIPatternInDels(), + )) + } + + return p + } + + return nil +} + func CLIDefinitionPatternPredicate() obiseq.SequencePredicate { if len(_DefinitionPatterns) > 0 { @@ -419,6 +478,7 @@ func CLISequenceSelectionPredicate() obiseq.SequencePredicate { p = p.And(CLIIdListPredicate()) p = p.And(CLIHasAttibutePredicate()) p = p.And(CLIIsAttibuteMatchPredicate()) + p = p.And(CLISequenceAgrep()) if _InvertMatch { p = p.Not()