diff --git a/cmd/obitools/obimultiplex2/main.go b/cmd/obitools/obimultiplex2/main.go deleted file mode 100644 index 6edb181..0000000 --- a/cmd/obitools/obimultiplex2/main.go +++ /dev/null @@ -1,56 +0,0 @@ -package main - -import ( - "fmt" - "os" - - log "github.com/sirupsen/logrus" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obimultiplex2" -) - -func main() { - - // f, err := os.Create("cpu.pprof") - // if err != nil { - // log.Fatal(err) - // } - // pprof.StartCPUProfile(f) - // defer pprof.StopCPUProfile() - - // ftrace, err := os.Create("cpu.trace") - // if err != nil { - // log.Fatal(err) - // } - // trace.Start(ftrace) - // defer trace.Stop() - - optionParser := obioptions.GenerateOptionParser(obimultiplex2.OptionSet) - - _, args := optionParser(os.Args) - - if obimultiplex2.CLIAskConfigTemplate() { - fmt.Print(obimultiplex2.CLIConfigTemplate()) - os.Exit(0) - } - - if !obimultiplex2.CLIHasNGSFilterFile() { - log.Error("You must provide a tag list file following the NGSFilter format") - os.Exit(1) - } - - sequences, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } - amplicons, _ := obimultiplex2.IExtractBarcode(sequences) - obiconvert.CLIWriteBioSequences(amplicons, true) - amplicons.Wait() - obiiter.WaitForLastPipe() - -} diff --git a/pkg/obitools/obimultiplex2/options.go b/doc/book/wolf_data/wolf_diet_ngsfilter.csv old mode 100644 new mode 100755 similarity index 61% rename from pkg/obitools/obimultiplex2/options.go rename to doc/book/wolf_data/wolf_diet_ngsfilter.csv index d4f1c3b..c2604ab --- a/pkg/obitools/obimultiplex2/options.go +++ b/doc/book/wolf_data/wolf_diet_ngsfilter.csv @@ -1,105 +1,4 @@ -package obimultiplex2 - -import ( - "fmt" - "os" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" - "github.com/DavidGamba/go-getoptions" - - log "github.com/sirupsen/logrus" -) - -var _NGSFilterFile = "" -var _askTemplate = false -var _UnidentifiedFile = "" -var _AllowedMismatch = -1 -var _AllowsIndel = false -var _ConservedError = false - -// PCROptionSet defines every options related to a simulated PCR. -// -// The function adds to a CLI every options proposed to the user -// to tune the parametters of the PCR simulation algorithm. -// -// # Parameters -// -// - option : is a pointer to a getoptions.GetOpt instance normaly -// produced by the -func MultiplexOptionSet(options *getoptions.GetOpt) { - options.StringVar(&_NGSFilterFile, "tag-list", _NGSFilterFile, - options.Alias("t"), - options.Description("File name of the NGSFilter file describing PCRs.")) - - options.BoolVar(&_ConservedError, "keep-errors", _ConservedError, - options.Description("Prints symbol counts.")) - - options.BoolVar(&_AllowsIndel, "with-indels", _AllowsIndel, - options.Description("Allows for indels during the primers matching.")) - - options.StringVar(&_UnidentifiedFile, "unidentified", _UnidentifiedFile, - options.Alias("u"), - options.Description("Filename used to store the sequences unassigned to any sample.")) - - options.IntVar(&_AllowedMismatch, "allowed-mismatches", _AllowedMismatch, - options.Alias("e"), - options.Description("Used to specify the number of errors allowed for matching primers.")) - - options.BoolVar(&_askTemplate, "template", _askTemplate, - options.Description("Print on the standard output an example of CSV configuration file."), - ) - -} - -func OptionSet(options *getoptions.GetOpt) { - obiconvert.OptionSet(options) - MultiplexOptionSet(options) -} - -func CLIAllowedMismatch() int { - return _AllowedMismatch -} - -func CLIAllowsIndel() bool { - return _AllowsIndel -} -func CLIUnidentifiedFileName() string { - return _UnidentifiedFile -} - -func CLIConservedErrors() bool { - return _UnidentifiedFile != "" || _ConservedError -} - -func CLIHasNGSFilterFile() bool { - return _NGSFilterFile != "" -} - -func CLINGSFIlter() (*obingslibrary.NGSLibrary, error) { - file, err := os.Open(_NGSFilterFile) - - if err != nil { - return nil, fmt.Errorf("open file error: %v", err) - } - - log.Infof("Reading NGSFilter file: %s", _NGSFilterFile) - ngsfiler, err := obiformats.ReadNGSFilter(file) - - if err != nil { - return nil, fmt.Errorf("NGSfilter reading file error: %v", err) - } - - return ngsfiler, nil -} - -func CLIAskConfigTemplate() bool { - return _askTemplate -} - -func CLIConfigTemplate() string { - return `### +### ### Example of NGSFilter CSV configuration file ### # @@ -192,11 +91,11 @@ func CLIConfigTemplate() string { # The sample tag must be unique in the library for a given pair of primers # + They can be a simple DNA word as here. This means that the same tag is used # for both primers. -# + It can be two DNA words separated by a colon. For example, aagtag:gaagtag. +# + It can be two DNA words separated by a colon. For example, `aagtag:gaagtag`. # This means that the first tag is used for the forward primer and the second for the # reverse primers. "aagtag" is the same as "aagtag:aagtag". # + In the two word syntax, if a primer forward or reverse is not tagged, its tag -# is replaced by a hyphen '-', for example 'aagtag:-' or '-:aagtag'. +# is replaced by a hyphen `-`, for example `aagtag:-` or `-:aagtag`. # For a given primer all the tags must have the same length. # - forward_primer: the forward primer sequence # - reverse_primer: the reverse primer sequence @@ -205,6 +104,4 @@ experiment,sample,sample_tag,forward_primer,reverse_primer wolf_diet,13a_F730603,aattaac,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG wolf_diet,15a_F730814,gaagtag,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG wolf_diet,26a_F040644,gaatatc,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG -wolf_diet,29a_F260619,gcctcct,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG -` -} +wolf_diet,29a_F260619,gcctcct,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG \ No newline at end of file diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index e6aa7b3..219ad66 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "a396240" +var _Commit = "748a235" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obitools/obimultiplex/demultiplex.go b/pkg/obitools/obimultiplex/demultiplex.go index 7b3f8ca..f7865b1 100644 --- a/pkg/obitools/obimultiplex/demultiplex.go +++ b/pkg/obitools/obimultiplex/demultiplex.go @@ -28,19 +28,20 @@ func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error log.Fatalf("%v", err) } - worker := obingslibrary.ExtractBarcodeSliceWorker(ngsfilter, opts...) + worker := ngsfilter.ExtractMultiBarcodeSliceWorker(opts...) newIter := iterator.MakeISliceWorker(worker, false) + out := newIter if !CLIConservedErrors() { - log.Println("Discards unassigned sequences") - newIter = newIter.Rebatch(obioptions.CLIBatchSize()) + log.Infoln("Discards unassigned sequences") + out = out.FilterOn(obiseq.HasAttribute("demultiplex_error").Not(), obioptions.CLIBatchSize()) } var unidentified obiiter.IBioSequence if CLIUnidentifiedFileName() != "" { log.Printf("Unassigned sequences saved in file: %s\n", CLIUnidentifiedFileName()) - unidentified, newIter = newIter.DivideOn(obiseq.HasAttribute("demultiplex_error"), + unidentified, out = newIter.DivideOn(obiseq.HasAttribute("demultiplex_error"), obioptions.CLIBatchSize()) go func() { @@ -56,5 +57,5 @@ func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error } log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers()) - return newIter, nil + return out, nil } diff --git a/pkg/obitools/obimultiplex/options.go b/pkg/obitools/obimultiplex/options.go index a62835f..d0ee643 100644 --- a/pkg/obitools/obimultiplex/options.go +++ b/pkg/obitools/obimultiplex/options.go @@ -15,7 +15,7 @@ import ( var _NGSFilterFile = "" var _askTemplate = false var _UnidentifiedFile = "" -var _AllowedMismatch = int(2) +var _AllowedMismatch = -1 var _AllowsIndel = false var _ConservedError = false @@ -99,10 +99,112 @@ func CLIAskConfigTemplate() bool { } func CLIConfigTemplate() string { - return `experiment,sample,sample_tag,forward_primer,reverse_primer + return `### +### Example of NGSFilter CSV configuration file +### +# +# The CSV file can contain comments starting with the # character +# and empty lines. +# At the top of the file a set of lines of three or four columns and having +# the first column containing @param can be used to define parameters +# for the obimultiplex tool. The structure of these lines is : +# +# @param,parameter_name,parameter_value +# @param,parameter_name,parameter_value1,parameter_value2 +# +# The following lines describes the PCR multiplexed in the sequencing library. +# The first line describes the columns of the CSV file and the following lines +# describe the PCR multiplexed. +# +# Five columns are expected : +# +# - experiment: the experiment name +# - sample: the sample (pcr) name +# - sample_tag: the tag identifying the sample +# - forward_primer: the forward primer sequence +# - reverse_primer: the reverse primer sequence +# +# Supplementary columns are allowed. Their names and content will be used to +# annotate the sequence corresponding to the sample, as the key=value; located +# after the @ sign did in the original ngsfilter file format. +# +### +### Description of the parameters +### +# +# The forward_spacer and the reverse_spacer allow to specify the number of +# nucleotide separating the 5' end of the forward or reverse primer respectively +# to the 3' end of the tag. The default value is 0. +# +# The param spacer allows for specify this value for both forward and reverse +# simultaneously. The spacer parameter can also, when used wirh two arguments, +# allow to specify the # the spacer value for a specific primer: +# +# @param,spacer,CAGCTGCTATGTCGATGCTGACT,2 +# +@param,forward_spacer,0 +@param,reverse_spacer,0 +# +# A new method for designing indel proof tag is to not use one of the four +# nucleotides in their sequence and to flank the tag with this fourth nucleotide. +# That nucleotide is the tag delimiter. Similarly, to the spacer value, +# three ways to specify the tag delimiter exist: +# - the forward_tag_delimiter and reverse_tag_delimiter +# - the tag_delimiter in its two forms with one and two arguments +# +@param,forward_tag_delimiter,0 +@param,reverse_tag_delimiter,0 +# +# Three algorithms are available to math a pair of tags with a sample. +# It is specified using the @matching parameter. The three possible +# values are strict, hamming, and indel. The default value is strict. +# As for previous parameters, forward_matching and reverse_matching can +# be used to specify the matching value for each primer. And spacer +# can be used with two arguments to specify the matching value for +# a specific primer. +# +@param,matching,strict +# +# The primer_mismatches parameter allows to specify the number of errors allowed +# when matching the primer. The default value is 2. The same declination of +# the parameters forward_primer_mismatches and reverse_primer_mismatches exist. +# +@param,primer_mismatches,2 +# +# The @indel parameter allows to specify if indel are allowed during the matching +# of the primers to the sequence. The default value is false. forward_indel and +# reverse_indel can be used to specify the value for each primer. +# +@param,indels,false +# +### +### Description of the PCR multiplexed +### +# +# Below is an example for the minimal description of the PCRs multiplexed in the +# sequencing library. +# +# The first line is the column names and must exist. +# Five columns are expected : +# - experiment: the experiment name, that allows for grouping samples +# - sample: the sample (pcr) name +# - sample_tag: the tag identifying the sample +# The sample tag must be unique in the library for a given pair of primers +# + They can be a simple DNA word as here. This means that the same tag is used +# for both primers. +# + It can be two DNA words separated by a colon. For example, aagtag:gaagtag. +# This means that the first tag is used for the forward primer and the second for the +# reverse primers. "aagtag" is the same as "aagtag:aagtag". +# + In the two word syntax, if a primer forward or reverse is not tagged, its tag +# is replaced by a hyphen '-', for example 'aagtag:-' or '-:aagtag'. +# For a given primer all the tags must have the same length. +# - forward_primer: the forward primer sequence +# - reverse_primer: the reverse primer sequence +# +experiment,sample,sample_tag,forward_primer,reverse_primer wolf_diet,13a_F730603,aattaac,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG -wolf_diet,15a_F730814,gaagtag:gaatatc,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG -wolf_diet,26a_F040644,gaatatc:-,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG -wolf_diet,29a_F260619,-:-,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG +wolf_diet,15a_F730814,gaagtag,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG +wolf_diet,26a_F040644,gaatatc,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG +wolf_diet,29a_F260619,gcctcct,TTAGATACCCCACTATGC,TAGAACAGGCTCCTCTAG ` } diff --git a/pkg/obitools/obimultiplex2/demultiplex.go b/pkg/obitools/obimultiplex2/demultiplex.go deleted file mode 100644 index 30de94d..0000000 --- a/pkg/obitools/obimultiplex2/demultiplex.go +++ /dev/null @@ -1,61 +0,0 @@ -package obimultiplex2 - -import ( - log "github.com/sirupsen/logrus" - - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obingslibrary" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" -) - -func IExtractBarcode(iterator obiiter.IBioSequence) (obiiter.IBioSequence, error) { - - opts := make([]obingslibrary.WithOption, 0, 10) - - opts = append(opts, - obingslibrary.OptionAllowedMismatches(CLIAllowedMismatch()), - obingslibrary.OptionAllowedIndel(CLIAllowsIndel()), - obingslibrary.OptionUnidentified(CLIUnidentifiedFileName()), - obingslibrary.OptionDiscardErrors(!CLIConservedErrors()), - obingslibrary.OptionParallelWorkers(obioptions.CLIParallelWorkers()), - obingslibrary.OptionBatchSize(obioptions.CLIBatchSize()), - ) - - ngsfilter, err := CLINGSFIlter() - if err != nil { - log.Fatalf("%v", err) - } - - worker := ngsfilter.ExtractMultiBarcodeSliceWorker(opts...) - - newIter := iterator.MakeISliceWorker(worker, false) - out := newIter - - if !CLIConservedErrors() { - log.Infoln("Discards unassigned sequences") - out = out.FilterOn(obiseq.HasAttribute("demultiplex_error").Not(), obioptions.CLIBatchSize()) - } - - var unidentified obiiter.IBioSequence - if CLIUnidentifiedFileName() != "" { - log.Printf("Unassigned sequences saved in file: %s\n", CLIUnidentifiedFileName()) - unidentified, out = newIter.DivideOn(obiseq.HasAttribute("demultiplex_error"), - obioptions.CLIBatchSize()) - - go func() { - _, err := obiconvert.CLIWriteBioSequences(unidentified, - true, - CLIUnidentifiedFileName()) - - if err != nil { - log.Fatalf("%v", err) - } - }() - - } - log.Printf("Sequence demultiplexing using %d workers\n", obioptions.CLIParallelWorkers()) - - return out, nil -}