From 31bfc88eb910dc16f55168b8dd5d785e1fe65c08 Mon Sep 17 00:00:00 2001 From: Eric Coissac Date: Tue, 13 Aug 2024 09:45:28 +0200 Subject: [PATCH] Patch a bug on writing to stdout, and add clearer error on openning data files --- cmd/obitools/obiannotate/main.go | 8 +- cmd/obitools/obiclean/main.go | 8 +- cmd/obitools/obicleandb/main.go | 8 +- cmd/obitools/obicomplement/main.go | 8 +- cmd/obitools/obiconsensus/main.go | 8 +- cmd/obitools/obiconvert/main.go | 8 +- cmd/obitools/obicount/main.go | 8 +- cmd/obitools/obicsv/main.go | 8 +- cmd/obitools/obidemerge/main.go | 8 +- cmd/obitools/obidistribute/main.go | 8 +- cmd/obitools/obigrep/main.go | 7 +- cmd/obitools/obijoin/main.go | 8 +- cmd/obitools/obikmermatch/main.go | 50 +++ cmd/obitools/obikmersimcount/main.go | 58 +++ cmd/obitools/obilandmark/main.go | 8 +- cmd/obitools/obimatrix/main.go | 8 +- cmd/obitools/obimicrosat/main.go | 7 +- cmd/obitools/obimultiplex/main.go | 5 +- cmd/obitools/obipcr/main.go | 8 +- cmd/obitools/obireffamidx/main.go | 8 +- cmd/obitools/obirefidx/main.go | 7 +- cmd/obitools/obiscript/main.go | 8 +- cmd/obitools/obisplit/main.go | 8 +- cmd/obitools/obisummary/main.go | 7 +- cmd/obitools/obitag/main.go | 6 +- cmd/obitools/obitag2/main.go | 6 +- cmd/obitools/obiuniq/main.go | 8 +- pkg/obialign/readalign.go | 144 +++++++ pkg/obiformats/fastseq_write_fasta.go | 3 +- pkg/obiformats/fastseq_write_fastq.go | 5 +- pkg/obiformats/seqfile_chunk_write.go | 13 +- pkg/obiformats/universal_write.go | 3 +- pkg/obifp/uint128.go | 289 +++++++++++++ pkg/obifp/uint256.go | 307 ++++++++++++++ pkg/obifp/uint64.go | 231 +++++++++++ pkg/obifp/unint.go | 41 ++ pkg/obiiter/batchiterator.go | 14 + pkg/obikmer/kmermap.go | 218 ++++++---- pkg/obioptions/version.go | 2 +- pkg/obitools/obiconvert/sequence_reader.go | 14 + pkg/obitools/obikmersim/obikmersim.go | 172 ++++++++ pkg/obitools/obikmersim/options.go | 140 +++++++ pkg/obiutils/uint128.go | 457 --------------------- 43 files changed, 1654 insertions(+), 696 deletions(-) create mode 100644 cmd/obitools/obikmermatch/main.go create mode 100644 cmd/obitools/obikmersimcount/main.go create mode 100644 pkg/obialign/readalign.go create mode 100644 pkg/obifp/uint128.go create mode 100644 pkg/obifp/uint256.go create mode 100644 pkg/obifp/uint64.go create mode 100644 pkg/obifp/unint.go create mode 100644 pkg/obitools/obikmersim/obikmersim.go create mode 100644 pkg/obitools/obikmersim/options.go delete mode 100644 pkg/obiutils/uint128.go diff --git a/cmd/obitools/obiannotate/main.go b/cmd/obitools/obiannotate/main.go index e9ed979..8e9076a 100644 --- a/cmd/obitools/obiannotate/main.go +++ b/cmd/obitools/obiannotate/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" @@ -37,11 +35,7 @@ func main() { _, args := optionParser(os.Args) sequences, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) annotator := obiannotate.CLIAnnotationPipeline() obiconvert.CLIWriteBioSequences(sequences.Pipe(annotator), true) diff --git a/cmd/obitools/obiclean/main.go b/cmd/obitools/obiclean/main.go index 8a29321..db81f4e 100644 --- a/cmd/obitools/obiclean/main.go +++ b/cmd/obitools/obiclean/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiclean" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -18,11 +16,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) cleaned := obiclean.CLIOBIClean(fs) diff --git a/cmd/obitools/obicleandb/main.go b/cmd/obitools/obicleandb/main.go index 94ba435..ba7ea6b 100644 --- a/cmd/obitools/obicleandb/main.go +++ b/cmd/obitools/obicleandb/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicleandb" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -20,11 +18,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) cleaned := obicleandb.ICleanDB(fs) diff --git a/cmd/obitools/obicomplement/main.go b/cmd/obitools/obicomplement/main.go index 252bfb5..883e305 100644 --- a/cmd/obitools/obicomplement/main.go +++ b/cmd/obitools/obicomplement/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -18,11 +16,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) comp := fs.MakeIWorker(obiseq.ReverseComplementWorker(true), true) obiconvert.CLIWriteBioSequences(comp, true) diff --git a/cmd/obitools/obiconsensus/main.go b/cmd/obitools/obiconsensus/main.go index 2d30d5e..2ba9827 100644 --- a/cmd/obitools/obiconsensus/main.go +++ b/cmd/obitools/obiconsensus/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconsensus" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -18,11 +16,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) cleaned := obiconsensus.CLIOBIMinion(fs) diff --git a/cmd/obitools/obiconvert/main.go b/cmd/obitools/obiconvert/main.go index 1081bc0..fdad6a8 100644 --- a/cmd/obitools/obiconvert/main.go +++ b/cmd/obitools/obiconvert/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -20,11 +18,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) obiconvert.CLIWriteBioSequences(fs, true) diff --git a/cmd/obitools/obicount/main.go b/cmd/obitools/obicount/main.go index 750a4be..82c55e7 100644 --- a/cmd/obitools/obicount/main.go +++ b/cmd/obitools/obicount/main.go @@ -4,8 +4,6 @@ import ( "fmt" "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicount" @@ -37,11 +35,7 @@ func main() { obioptions.SetStrictReadWorker(min(4, obioptions.CLIParallelWorkers())) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) nvariant, nread, nsymbol := fs.Count(true) diff --git a/cmd/obitools/obicsv/main.go b/cmd/obitools/obicsv/main.go index 529a391..0ff041a 100644 --- a/cmd/obitools/obicsv/main.go +++ b/cmd/obitools/obicsv/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -17,11 +15,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) obicsv.CLIWriteCSV(fs, true) diff --git a/cmd/obitools/obidemerge/main.go b/cmd/obitools/obidemerge/main.go index 7746be0..dd4c398 100644 --- a/cmd/obitools/obidemerge/main.go +++ b/cmd/obitools/obidemerge/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obidemerge" @@ -21,11 +19,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) demerged := obidemerge.CLIDemergeSequences(fs) diff --git a/cmd/obitools/obidistribute/main.go b/cmd/obitools/obidistribute/main.go index 0378168..c9ef90e 100644 --- a/cmd/obitools/obidistribute/main.go +++ b/cmd/obitools/obidistribute/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obidistribute" @@ -18,11 +16,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) obidistribute.CLIDistributeSequence(fs) diff --git a/cmd/obitools/obigrep/main.go b/cmd/obitools/obigrep/main.go index a5e63cb..7477d80 100644 --- a/cmd/obitools/obigrep/main.go +++ b/cmd/obitools/obigrep/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" @@ -37,11 +35,8 @@ func main() { _, args := optionParser(os.Args) sequences, err := obiconvert.CLIReadBioSequences(args...) + obiconvert.OpenSequenceDataErrorMessage(args, err) - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } selected := obigrep.CLIFilterSequence(sequences) obiconvert.CLIWriteBioSequences(selected, true) obiiter.WaitForLastPipe() diff --git a/cmd/obitools/obijoin/main.go b/cmd/obitools/obijoin/main.go index c83442c..eda016d 100644 --- a/cmd/obitools/obijoin/main.go +++ b/cmd/obitools/obijoin/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obijoin" @@ -21,11 +19,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) joined := obijoin.CLIJoinSequences(fs) diff --git a/cmd/obitools/obikmermatch/main.go b/cmd/obitools/obikmermatch/main.go new file mode 100644 index 0000000..4e286fd --- /dev/null +++ b/cmd/obitools/obikmermatch/main.go @@ -0,0 +1,50 @@ +package main + +import ( + "os" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obikmersim" +) + +func main() { + + defer obiseq.LogBioSeqStatus() + + // go tool pprof -http=":8000" ./obipairing ./cpu.pprof + // f, err := os.Create("cpu.pprof") + // if err != nil { + // log.Fatal(err) + // } + // pprof.StartCPUProfile(f) + // defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() + + optionParser := obioptions.GenerateOptionParser(obikmersim.MatchOptionSet) + + _, args := optionParser(os.Args) + + var err error + sequences := obiiter.NilIBioSequence + + if !obikmersim.CLISelf() { + sequences, err = obiconvert.CLIReadBioSequences(args...) + } + + obiconvert.OpenSequenceDataErrorMessage(args, err) + + selected := obikmersim.CLIAlignSequences(sequences) + obiconvert.CLIWriteBioSequences(selected, true) + obiiter.WaitForLastPipe() + +} diff --git a/cmd/obitools/obikmersimcount/main.go b/cmd/obitools/obikmersimcount/main.go new file mode 100644 index 0000000..a0496c1 --- /dev/null +++ b/cmd/obitools/obikmersimcount/main.go @@ -0,0 +1,58 @@ +package main + +import ( + "log" + "os" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obikmersim" +) + +func main() { + + defer obiseq.LogBioSeqStatus() + + // go tool pprof -http=":8000" ./obipairing ./cpu.pprof + // f, err := os.Create("cpu.pprof") + // if err != nil { + // log.Fatal(err) + // } + // pprof.StartCPUProfile(f) + // defer pprof.StopCPUProfile() + + // go tool trace cpu.trace + // ftrace, err := os.Create("cpu.trace") + // if err != nil { + // log.Fatal(err) + // } + // trace.Start(ftrace) + // defer trace.Stop() + + optionParser := obioptions.GenerateOptionParser(obikmersim.CountOptionSet) + + _, args := optionParser(os.Args) + + var err error + sequences := obiiter.NilIBioSequence + + if !obikmersim.CLISelf() { + sequences, err = obiconvert.CLIReadBioSequences(args...) + } + + obiconvert.OpenSequenceDataErrorMessage(args, err) + + selected := obikmersim.CLILookForSharedKmers(sequences) + topull, err := obiconvert.CLIWriteBioSequences(selected, false) + + if err == nil { + log.Panic(err) + } + + topull.Consume() + + obiiter.WaitForLastPipe() + +} diff --git a/cmd/obitools/obilandmark/main.go b/cmd/obitools/obilandmark/main.go index aae7854..974041a 100644 --- a/cmd/obitools/obilandmark/main.go +++ b/cmd/obitools/obilandmark/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obilandmark" @@ -18,11 +16,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) indexed := obilandmark.CLISelectLandmarkSequences(fs) diff --git a/cmd/obitools/obimatrix/main.go b/cmd/obitools/obimatrix/main.go index e5de5ff..1c7d135 100644 --- a/cmd/obitools/obimatrix/main.go +++ b/cmd/obitools/obimatrix/main.go @@ -4,8 +4,6 @@ import ( "fmt" "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -39,11 +37,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) matrix := obimatrix.IMatrix(fs) diff --git a/cmd/obitools/obimicrosat/main.go b/cmd/obitools/obimicrosat/main.go index 5035ff0..24849b5 100644 --- a/cmd/obitools/obimicrosat/main.go +++ b/cmd/obitools/obimicrosat/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" @@ -37,11 +35,8 @@ func main() { _, args := optionParser(os.Args) sequences, err := obiconvert.CLIReadBioSequences(args...) + obiconvert.OpenSequenceDataErrorMessage(args, err) - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } selected := obimicrosat.CLIAnnotateMicrosat(sequences) obiconvert.CLIWriteBioSequences(selected, true) obiiter.WaitForLastPipe() diff --git a/cmd/obitools/obimultiplex/main.go b/cmd/obitools/obimultiplex/main.go index bea5bfb..b460e30 100644 --- a/cmd/obitools/obimultiplex/main.go +++ b/cmd/obitools/obimultiplex/main.go @@ -43,11 +43,8 @@ func main() { } sequences, err := obiconvert.CLIReadBioSequences(args...) + obiconvert.OpenSequenceDataErrorMessage(args, err) - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } amplicons, _ := obimultiplex.IExtractBarcode(sequences) obiconvert.CLIWriteBioSequences(amplicons, true) amplicons.Wait() diff --git a/cmd/obitools/obipcr/main.go b/cmd/obitools/obipcr/main.go index 8d255da..1eb864e 100644 --- a/cmd/obitools/obipcr/main.go +++ b/cmd/obitools/obipcr/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" @@ -35,11 +33,7 @@ func main() { _, args := optionParser(os.Args) sequences, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) amplicons, _ := obipcr.CLIPCR(sequences) obiconvert.CLIWriteBioSequences(amplicons, true) diff --git a/cmd/obitools/obireffamidx/main.go b/cmd/obitools/obireffamidx/main.go index dea18c4..5377f32 100644 --- a/cmd/obitools/obireffamidx/main.go +++ b/cmd/obitools/obireffamidx/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obirefidx" @@ -18,11 +16,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) indexed := obirefidx.IndexFamilyDB(fs) diff --git a/cmd/obitools/obirefidx/main.go b/cmd/obitools/obirefidx/main.go index 9652eb9..65eb5b8 100644 --- a/cmd/obitools/obirefidx/main.go +++ b/cmd/obitools/obirefidx/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obirefidx" @@ -18,11 +16,8 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) + obiconvert.OpenSequenceDataErrorMessage(args, err) - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } indexed := obirefidx.IndexReferenceDB(fs) obiconvert.CLIWriteBioSequences(indexed, true) diff --git a/cmd/obitools/obiscript/main.go b/cmd/obitools/obiscript/main.go index 224c947..453b130 100644 --- a/cmd/obitools/obiscript/main.go +++ b/cmd/obitools/obiscript/main.go @@ -4,8 +4,6 @@ import ( "fmt" "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" @@ -43,11 +41,7 @@ func main() { } sequences, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) annotator := obiscript.CLIScriptPipeline() obiconvert.CLIWriteBioSequences(sequences.Pipe(annotator), true) diff --git a/cmd/obitools/obisplit/main.go b/cmd/obitools/obisplit/main.go index 95a402b..44e8b17 100644 --- a/cmd/obitools/obisplit/main.go +++ b/cmd/obitools/obisplit/main.go @@ -4,8 +4,6 @@ import ( "fmt" "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" @@ -43,11 +41,7 @@ func main() { } sequences, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) annotator := obisplit.CLISlitPipeline() obiconvert.CLIWriteBioSequences(sequences.Pipe(annotator), true) diff --git a/cmd/obitools/obisummary/main.go b/cmd/obitools/obisummary/main.go index dd69026..73ecde9 100644 --- a/cmd/obitools/obisummary/main.go +++ b/cmd/obitools/obisummary/main.go @@ -5,7 +5,6 @@ import ( "fmt" "os" - log "github.com/sirupsen/logrus" "gopkg.in/yaml.v3" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" @@ -39,11 +38,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) summary := obisummary.ISummary(fs, obisummary.CLIMapSummary()) diff --git a/cmd/obitools/obitag/main.go b/cmd/obitools/obitag/main.go index f48fb72..0f4add0 100644 --- a/cmd/obitools/obitag/main.go +++ b/cmd/obitools/obitag/main.go @@ -42,11 +42,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) taxo, error := obifind.CLILoadSelectedTaxonomy() if error != nil { diff --git a/cmd/obitools/obitag2/main.go b/cmd/obitools/obitag2/main.go index 9a4c7bc..2d82720 100644 --- a/cmd/obitools/obitag2/main.go +++ b/cmd/obitools/obitag2/main.go @@ -43,11 +43,7 @@ func main() { _, args := optionParser(os.Args) fs, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) taxo, error := obifind.CLILoadSelectedTaxonomy() if error != nil { diff --git a/cmd/obitools/obiuniq/main.go b/cmd/obitools/obiuniq/main.go index b3dcf1a..959cf91 100644 --- a/cmd/obitools/obiuniq/main.go +++ b/cmd/obitools/obiuniq/main.go @@ -3,8 +3,6 @@ package main import ( "os" - log "github.com/sirupsen/logrus" - "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" @@ -37,11 +35,7 @@ func main() { _, args := optionParser(os.Args) sequences, err := obiconvert.CLIReadBioSequences(args...) - - if err != nil { - log.Errorf("Cannot open file (%v)", err) - os.Exit(1) - } + obiconvert.OpenSequenceDataErrorMessage(args, err) unique := obiuniq.CLIUnique(sequences) obiconvert.CLIWriteBioSequences(unique, true) diff --git a/pkg/obialign/readalign.go b/pkg/obialign/readalign.go new file mode 100644 index 0000000..10a2e65 --- /dev/null +++ b/pkg/obialign/readalign.go @@ -0,0 +1,144 @@ +package obialign + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +func ReadAlign(seqA, seqB *obiseq.BioSequence, + gap, scale float64, delta int, fastScoreRel bool, + arena PEAlignArena, shift_buff *map[int]int) (int, []int, int, int, float64, bool) { + var score, shift int + var startA, startB int + var partLen, over int + var rawSeqA, qualSeqA []byte + var rawSeqB, qualSeqB []byte + var extra5, extra3 int + + var path []int + + if !_InitializedDnaScore { + _InitDNAScoreMatrix() + } + + fastCount := -1 + fastScore := -1.0 + + directAlignment := true + + index := obikmer.Index4mer(seqA, + &arena.pointer.fastIndex, + &arena.pointer.fastBuffer) + + shift, fastCount, fastScore = obikmer.FastShiftFourMer(index, shift_buff, seqA.Len(), seqB, fastScoreRel, nil) + + seqBR := seqB.ReverseComplement(false) + shiftR, fastCountR, fastScoreR := obikmer.FastShiftFourMer(index, shift_buff, seqA.Len(), seqBR, fastScoreRel, nil) + + if fastCount < fastCountR { + shift = shiftR + fastCount = fastCountR + fastScore = fastScoreR + seqB = seqBR + directAlignment = false + } + + if shift > 0 { + over = seqA.Len() - shift + } else { + over = seqB.Len() + shift + } + + // At least one mismatch exists in the overlaping region + if fastCount+3 < over { + + if shift > 0 { + startA = shift - delta + if startA < 0 { + startA = 0 + } + extra5 = -startA + startB = 0 + + rawSeqA = seqA.Sequence()[startA:] + qualSeqA = seqA.Qualities()[startA:] + partLen = len(rawSeqA) + if partLen > seqB.Len() { + partLen = seqB.Len() + } + rawSeqB = seqB.Sequence()[0:partLen] + qualSeqB = seqB.Qualities()[0:partLen] + extra3 = seqB.Len() - partLen + score = _FillMatrixPeLeftAlign( + rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, scale, + &arena.pointer.scoreMatrix, + &arena.pointer.pathMatrix) + } else { + + startA = 0 + startB = -shift - delta + if startB < 0 { + startB = 0 + } + extra5 = startB + rawSeqB = seqB.Sequence()[startB:] + qualSeqB = seqB.Qualities()[startB:] + partLen = len(rawSeqB) + if partLen > seqA.Len() { + partLen = seqA.Len() + } + rawSeqA = seqA.Sequence()[:partLen] + qualSeqA = seqA.Qualities()[:partLen] + extra3 = partLen - seqA.Len() + + score = _FillMatrixPeRightAlign( + rawSeqA, qualSeqA, rawSeqB, qualSeqB, gap, scale, + &arena.pointer.scoreMatrix, + &arena.pointer.pathMatrix) + } + + path = _Backtracking(arena.pointer.pathMatrix, + len(rawSeqA), len(rawSeqB), + &arena.pointer.path) + + } else { + + // Both overlaping regions are identicals + + if shift > 0 { + startA = shift + startB = 0 + extra5 = -startA + qualSeqA = seqA.Qualities()[startA:] + partLen = len(qualSeqA) + qualSeqB = seqB.Qualities()[0:partLen] + extra3 = seqB.Len() - partLen + score = 0 + } else { + startA = 0 + startB = -shift + extra5 = startB + qualSeqB = seqB.Qualities()[startB:] + partLen = len(qualSeqB) + extra3 = partLen - seqA.Len() + qualSeqA = seqA.Qualities()[:partLen] + } + score = 0 + for i, qualA := range qualSeqA { + qualB := qualSeqB[i] + score += _NucScorePartMatchMatch[qualA][qualB] + } + + path = arena.pointer.path[:0] + path = append(path, 0, partLen) + } + + path[0] += extra5 + if path[len(path)-1] == 0 { + path[len(path)-2] += extra3 + } else { + path = append(path, extra3, 0) + } + + return score, path, fastCount, over, fastScore, directAlignment +} diff --git a/pkg/obiformats/fastseq_write_fasta.go b/pkg/obiformats/fastseq_write_fasta.go index 2f0d9ec..4c9f7ef 100644 --- a/pkg/obiformats/fastseq_write_fasta.go +++ b/pkg/obiformats/fastseq_write_fasta.go @@ -183,7 +183,8 @@ func WriteFasta(iterator obiiter.IBioSequence, // The function returns the same bio sequence iterator and an error if any occurred. func WriteFastaToStdout(iterator obiiter.IBioSequence, options ...WithOption) (obiiter.IBioSequence, error) { - options = append(options, OptionDontCloseFile()) + // options = append(options, OptionDontCloseFile()) + options = append(options, OptionCloseFile()) return WriteFasta(iterator, os.Stdout, options...) } diff --git a/pkg/obiformats/fastseq_write_fastq.go b/pkg/obiformats/fastseq_write_fastq.go index c79ad5a..e3959c1 100644 --- a/pkg/obiformats/fastseq_write_fastq.go +++ b/pkg/obiformats/fastseq_write_fastq.go @@ -97,7 +97,6 @@ func WriteFastq(iterator obiiter.IBioSequence, options ...WithOption) (obiiter.IBioSequence, error) { opt := MakeOptions(options) - iterator = iterator file, _ = obiutils.CompressStream(file, opt.CompressedFile(), opt.CloseFile()) @@ -147,7 +146,9 @@ func WriteFastq(iterator obiiter.IBioSequence, func WriteFastqToStdout(iterator obiiter.IBioSequence, options ...WithOption) (obiiter.IBioSequence, error) { - options = append(options, OptionDontCloseFile()) + // options = append(options, OptionDontCloseFile()) + options = append(options, OptionCloseFile()) + return WriteFastq(iterator, os.Stdout, options...) } diff --git a/pkg/obiformats/seqfile_chunk_write.go b/pkg/obiformats/seqfile_chunk_write.go index 5752788..57beb4a 100644 --- a/pkg/obiformats/seqfile_chunk_write.go +++ b/pkg/obiformats/seqfile_chunk_write.go @@ -21,11 +21,21 @@ func WriteSeqFileChunk( toBePrinted := make(map[int]SeqFileChunk) for chunk := range chunk_channel { if chunk.Order == nextToPrint { - _, _ = writer.Write(chunk.Raw.Bytes()) + log.Debugf("Writing chunk: %d of length %d bytes", + chunk.Order, + len(chunk.Raw.Bytes())) + + n, err := writer.Write(chunk.Raw.Bytes()) + + if err != nil { + log.Fatalf("Cannot write chunk %d only %d bytes written on %d sended : %v", + chunk.Order, n, len(chunk.Raw.Bytes()), err) + } nextToPrint++ chunk, ok := toBePrinted[nextToPrint] for ok { + log.Debug("Writing buffered chunk : ", chunk.Order) _, _ = writer.Write(chunk.Raw.Bytes()) delete(toBePrinted, nextToPrint) nextToPrint++ @@ -36,6 +46,7 @@ func WriteSeqFileChunk( } } + log.Debugf("FIle have to be closed : %v", toBeClosed) if toBeClosed { err := writer.Close() if err != nil { diff --git a/pkg/obiformats/universal_write.go b/pkg/obiformats/universal_write.go index 6d8da4a..2f3061f 100644 --- a/pkg/obiformats/universal_write.go +++ b/pkg/obiformats/universal_write.go @@ -45,7 +45,8 @@ func WriteSequence(iterator obiiter.IBioSequence, func WriteSequencesToStdout(iterator obiiter.IBioSequence, options ...WithOption) (obiiter.IBioSequence, error) { - options = append(options, OptionDontCloseFile()) + // options = append(options, OptionDontCloseFile()) + options = append(options, OptionCloseFile()) return WriteSequence(iterator, os.Stdout, options...) } diff --git a/pkg/obifp/uint128.go b/pkg/obifp/uint128.go new file mode 100644 index 0000000..0ecdfc5 --- /dev/null +++ b/pkg/obifp/uint128.go @@ -0,0 +1,289 @@ +package obifp + +import ( + "math" + "math/bits" + + log "github.com/sirupsen/logrus" +) + +type Uint128 struct { + w1 uint64 + w0 uint64 +} + +// Zero returns a zero-valued uint128. +// +// No parameters. +// Returns a Uint128 value. +func (u Uint128) Zero() Uint128 { + return Uint128{w1: 0, w0: 0} +} + +// MaxValue returns the maximum possible value for a Uint128. +// +// It returns a Uint128 value with the highest possible values for high and low fields. +func (u Uint128) MaxValue() Uint128 { + return Uint128{ + w1: math.MaxUint64, + w0: math.MaxUint64, + } + +} + +// IsZero checks if the Uint128 value is zero. +// +// It returns a boolean indicating whether the Uint128 value is zero. +func (u Uint128) IsZero() bool { + return u.w0 == 0 && u.w1 == 0 +} + +// Cast a Uint128 to a Uint64. +// +// A Warning will be logged if an overflow occurs. +// +// No parameters. +// Returns a Uint64 value. +func (u Uint128) Uint64() Uint64 { + if u.w1 != 0 { + log.Warnf("Uint128 overflow at Uint64(%v)", u) + } + return Uint64{w0: u.w0} +} + +// Uint128 cast a Uint128 to a Uint128. +// +// Which is a no-op. +// +// No parameters. +// Returns a Uint128 value. +func (u Uint128) Uint128() Uint128 { + return u +} + +// Cast a Uint128 to a Uint256. +// +// A Warning will be logged if an overflow occurs. +// +// No parameters. +// Returns a Uint256 value. +func (u Uint128) Uint256() Uint256 { + return Uint256{0, 0, u.w1, u.w0} +} + +func (u Uint128) Set64(v uint64) Uint128 { + + return Uint128{ + w1: 0, + w0: v, + } +} + +// LeftShift performs a left shift operation on the Uint128 value by the specified number of bits. +// +// Parameters: +// - n: the number of bits to shift the Uint128 value to the left. +// +// Returns: +// - Uint128: the result of the left shift operation. +func (u Uint128) LeftShift(n uint) Uint128 { + lo, carry := Uint64{w0: u.w0}.LeftShift64(n, 0) + hi, _ := Uint64{w0: u.w1}.LeftShift64(n, carry) + return Uint128{w1: hi, w0: lo} +} + +// RightShift performs a right shift operation on the Uint128 value by the specified number of bits. +// +// Parameters: +// - n: the number of bits to shift the Uint128 value to the right. +// +// Returns: +// - Uint128: the result of the right shift operation. +func (u Uint128) RightShift(n uint) Uint128 { + hi, carry := Uint64{w0: u.w1}.RightShift64(n, 0) + lo, _ := Uint64{w0: u.w0}.RightShift64(n, carry) + return Uint128{w1: hi, w0: lo} +} + +// Add performs addition of two Uint128 values and returns the result. +// +// Parameters: +// - v: the Uint128 value to add to the receiver. +// +// Returns: +// - Uint128: the result of the addition. +func (u Uint128) Add(v Uint128) Uint128 { + lo, carry := bits.Add64(u.w0, v.w0, 0) + hi, carry := bits.Add64(u.w1, v.w1, carry) + if carry != 0 { + log.Panicf("Uint128 overflow at Add(%v, %v)", u, v) + } + return Uint128{w1: hi, w0: lo} +} + +func (u Uint128) Add64(v uint64) Uint128 { + lo, carry := bits.Add64(u.w0, v, 0) + hi, carry := bits.Add64(u.w1, 0, carry) + if carry != 0 { + log.Panicf("Uint128 overflow at Add64(%v, %v)", u, v) + } + return Uint128{w1: hi, w0: lo} +} + +func (u Uint128) Sub(v Uint128) Uint128 { + lo, borrow := bits.Sub64(u.w0, v.w0, 0) + hi, borrow := bits.Sub64(u.w1, v.w1, borrow) + if borrow != 0 { + log.Panicf("Uint128 underflow at Sub(%v, %v)", u, v) + } + return Uint128{w1: hi, w0: lo} +} + +func (u Uint128) Mul(v Uint128) Uint128 { + hi, lo := bits.Mul64(u.w0, v.w0) + p0, p1 := bits.Mul64(u.w1, v.w0) + p2, p3 := bits.Mul64(u.w0, v.w1) + hi, c0 := bits.Add64(hi, p1, 0) + hi, c1 := bits.Add64(hi, p3, c0) + if (u.w1 != 0 && v.w1 != 0) || p0 != 0 || p2 != 0 || c1 != 0 { + log.Panicf("Uint128 overflow at Mul(%v, %v)", u, v) + } + return Uint128{w1: hi, w0: lo} +} + +func (u Uint128) Mul64(v uint64) Uint128 { + hi, lo := bits.Mul64(u.w0, v) + p0, p1 := bits.Mul64(u.w1, v) + hi, c0 := bits.Add64(hi, p1, 0) + if p0 != 0 || c0 != 0 { + log.Panicf("Uint128 overflow at Mul64(%v, %v)", u, v) + } + return Uint128{w1: hi, w0: lo} +} + +func (u Uint128) QuoRem(v Uint128) (q, r Uint128) { + if v.w1 == 0 { + var r64 uint64 + q, r64 = u.QuoRem64(v.w0) + r = Uint128{w1: 0, w0: r64} + } else { + // generate a "trial quotient," guaranteed to be within 1 of the actual + // quotient, then adjust. + n := uint(bits.LeadingZeros64(v.w1)) + v1 := v.LeftShift(n) + u1 := u.RightShift(1) + tq, _ := bits.Div64(u1.w1, u1.w0, v1.w1) + tq >>= 63 - n + if tq != 0 { + tq-- + } + q = Uint128{w1: 0, w0: tq} + // calculate remainder using trial quotient, then adjust if remainder is + // greater than divisor + r = u.Sub(v.Mul64(tq)) + if r.Cmp(v) >= 0 { + q = q.Add64(1) + r = r.Sub(v) + } + } + return +} + +// QuoRem64 returns q = u/v and r = u%v. +func (u Uint128) QuoRem64(v uint64) (q Uint128, r uint64) { + if u.w1 < v { + q.w0, r = bits.Div64(u.w1, u.w0, v) + } else { + q.w1, r = bits.Div64(0, u.w1, v) + q.w0, r = bits.Div64(r, u.w0, v) + } + return +} + +func (u Uint128) Div(v Uint128) Uint128 { + q, _ := u.QuoRem(v) + return q +} + +func (u Uint128) Div64(v uint64) Uint128 { + q, _ := u.QuoRem64(v) + return q +} + +func (u Uint128) Mod(v Uint128) Uint128 { + _, r := u.QuoRem(v) + return r +} + +func (u Uint128) Mod64(v uint64) uint64 { + _, r := u.QuoRem64(v) + return r +} + +func (u Uint128) Cmp(v Uint128) int { + switch { + case u.w1 > v.w1: + return 1 + case u.w1 < v.w1: + return -1 + case u.w0 > v.w0: + return 1 + case u.w0 < v.w0: + return -1 + default: + return 0 + } +} + +func (u Uint128) Cmp64(v uint64) int { + switch { + case u.w1 > 0: + return 1 + case u.w0 > v: + return 1 + case u.w0 < v: + return -1 + default: + return 0 + } +} + +func (u Uint128) Equals(v Uint128) bool { + return u.Cmp(v) == 0 +} + +func (u Uint128) LessThan(v Uint128) bool { + return u.Cmp(v) < 0 +} + +func (u Uint128) GreaterThan(v Uint128) bool { + return u.Cmp(v) > 0 +} + +func (u Uint128) LessThanOrEqual(v Uint128) bool { + return !u.GreaterThan(v) +} + +func (u Uint128) GreaterThanOrEqual(v Uint128) bool { + return !u.LessThan(v) +} + +func (u Uint128) And(v Uint128) Uint128 { + return Uint128{w1: u.w1 & v.w1, w0: u.w0 & v.w0} +} + +func (u Uint128) Or(v Uint128) Uint128 { + return Uint128{w1: u.w1 | v.w1, w0: u.w0 | v.w0} +} + +func (u Uint128) Xor(v Uint128) Uint128 { + return Uint128{w1: u.w1 ^ v.w1, w0: u.w0 ^ v.w0} +} + +func (u Uint128) Not() Uint128 { + return Uint128{w1: ^u.w1, w0: ^u.w0} +} + +func (u Uint128) AsUint64() uint64 { + return u.w0 +} diff --git a/pkg/obifp/uint256.go b/pkg/obifp/uint256.go new file mode 100644 index 0000000..c685d4d --- /dev/null +++ b/pkg/obifp/uint256.go @@ -0,0 +1,307 @@ +package obifp + +import ( + "math" + "math/bits" + + log "github.com/sirupsen/logrus" +) + +type Uint256 struct { + w3 uint64 + w2 uint64 + w1 uint64 + w0 uint64 +} + +// Zero returns a zero value of type Uint256. +// +// No parameters. +// Returns a Uint256 value of 0. +func (u Uint256) Zero() Uint256 { + return Uint256{} +} + +// MaxValue returns the maximum possible value of type Uint256. +// +// No parameters. +// Returns the maximum value of type Uint256. +func (u Uint256) MaxValue() Uint256 { + return Uint256{ + w3: math.MaxUint64, + w2: math.MaxUint64, + w1: math.MaxUint64, + w0: math.MaxUint64, + } +} + +// IsZero checks if the Uint256 value is zero. +// +// No parameters. +// Returns a boolean indicating if the value is zero. +func (u Uint256) IsZero() bool { + return u == Uint256{} +} + +// Cast a Uint256 to a Uint64. +// +// A Warning will be logged if an overflow occurs. +// +// No parameters. +// Returns a Uint64 value. +func (u Uint256) Uint64() Uint64 { + if u.w3 != 0 || u.w2 != 0 || u.w1 != 0 { + log.Warnf("Uint256 overflow at Uint64(%v)", u) + } + return Uint64{w0: u.w0} +} + +// Cast a Uint256 to a Uint128. +// +// A Warning will be logged if an overflow occurs. +// +// No parameters. +// Returns a Uint128 value. +func (u Uint256) Uint128() Uint128 { + if u.w3 != 0 || u.w2 != 0 { + log.Warnf("Uint256 overflow at Uint128(%v)", u) + } + return Uint128{u.w1, u.w0} +} + +// Cast a Uint128 to a Uint256. +// +// A Warning will be logged if an overflow occurs. +// +// No parameters. +// Returns a Uint256 value. +func (u Uint256) Uint256() Uint256 { + return u +} + +func (u Uint256) Set64(v uint64) Uint256 { + + return Uint256{ + w3: 0, + w2: 0, + w1: 0, + w0: v, + } +} + +func (u Uint256) LeftShift(n uint) Uint256 { + w0, carry := Uint64{w0: u.w0}.LeftShift64(n, 0) + w1, carry := Uint64{w0: u.w1}.LeftShift64(n, carry) + w2, carry := Uint64{w0: u.w2}.LeftShift64(n, carry) + w3, _ := Uint64{w0: u.w3}.LeftShift64(n, carry) + return Uint256{w3, w2, w1, w0} +} + +func (u Uint256) RightShift(n uint) Uint256 { + w3, carry := Uint64{w0: u.w3}.RightShift64(n, 0) + w2, carry := Uint64{w0: u.w2}.RightShift64(n, carry) + w1, carry := Uint64{w0: u.w1}.RightShift64(n, carry) + w0, _ := Uint64{w0: u.w0}.RightShift64(n, carry) + return Uint256{w3, w2, w1, w0} +} + +func (u Uint256) Cmp(v Uint256) int { + switch { + case u.w3 > v.w3: + return 1 + case u.w3 < v.w3: + return -1 + case u.w2 > v.w2: + return 1 + case u.w2 < v.w2: + return -1 + case u.w1 > v.w1: + return 1 + case u.w1 < v.w1: + return -1 + case u.w0 > v.w0: + return 1 + case u.w0 < v.w0: + return -1 + default: + return 0 + } +} + +// Add performs addition of two Uint256 values and returns the result. +// +// Parameters: +// - v: the Uint256 value to add to the receiver. +// +// Returns: +// - Uint256: the result of the addition. +func (u Uint256) Add(v Uint256) Uint256 { + w0, carry := bits.Add64(u.w0, v.w0, 0) + w1, carry := bits.Add64(u.w1, v.w1, carry) + w2, carry := bits.Add64(u.w2, v.w2, carry) + w3, carry := bits.Add64(u.w3, v.w3, carry) + if carry != 0 { + log.Panicf("Uint256 overflow at Add(%v, %v)", u, v) + } + return Uint256{w3, w2, w1, w0} +} + +// Sub performs subtraction of two Uint256 values and returns the result. +// +// Parameters: +// - v: the Uint256 value to subtract from the receiver. +// +// Returns: +// - Uint256: the result of the subtraction. +func (u Uint256) Sub(v Uint256) Uint256 { + w0, borrow := bits.Sub64(u.w0, v.w0, 0) + w1, borrow := bits.Sub64(u.w1, v.w1, borrow) + w2, borrow := bits.Sub64(u.w2, v.w2, borrow) + w3, borrow := bits.Sub64(u.w3, v.w3, borrow) + if borrow != 0 { + log.Panicf("Uint256 overflow at Sub(%v, %v)", u, v) + } + return Uint256{w3, w2, w1, w0} +} + +// Mul performs multiplication of two Uint256 values and returns the result. +// +// Parameters: +// - v: the Uint256 value to multiply with the receiver. +// +// Returns: +// - Uint256: the result of the multiplication. +func (u Uint256) Mul(v Uint256) Uint256 { + var w0, w1, w2, w3, carry uint64 + + w0Low, w0High := bits.Mul64(u.w0, v.w0) + w1Low1, w1High1 := bits.Mul64(u.w0, v.w1) + w1Low2, w1High2 := bits.Mul64(u.w1, v.w0) + w2Low1, w2High1 := bits.Mul64(u.w0, v.w2) + w2Low2, w2High2 := bits.Mul64(u.w1, v.w1) + w2Low3, w2High3 := bits.Mul64(u.w2, v.w0) + w3Low1, w3High1 := bits.Mul64(u.w0, v.w3) + w3Low2, w3High2 := bits.Mul64(u.w1, v.w2) + w3Low3, w3High3 := bits.Mul64(u.w2, v.w1) + w3Low4, w3High4 := bits.Mul64(u.w3, v.w0) + + w0 = w0Low + + w1, carry = bits.Add64(w1Low1, w1Low2, 0) + w1, _ = bits.Add64(w1, w0High, carry) + + w2, carry = bits.Add64(w2Low1, w2Low2, 0) + w2, carry = bits.Add64(w2, w2Low3, carry) + w2, carry = bits.Add64(w2, w1High1, carry) + w2, _ = bits.Add64(w2, w1High2, carry) + + w3, carry = bits.Add64(w3Low1, w3Low2, 0) + w3, carry = bits.Add64(w3, w3Low3, carry) + w3, carry = bits.Add64(w3, w3Low4, carry) + w3, carry = bits.Add64(w3, w2High1, carry) + w3, carry = bits.Add64(w3, w2High2, carry) + w3, carry = bits.Add64(w3, w2High3, carry) + + if w3High1 != 0 || w3High2 != 0 || w3High3 != 0 || w3High4 != 0 || carry != 0 { + log.Panicf("Uint256 overflow at Mul(%v, %v)", u, v) + } + + return Uint256{w3, w2, w1, w0} +} + +// Div performs division of two Uint256 values and returns the result. +// +// Parameters: +// - v: the Uint256 value to divide with the receiver. +// +// Returns: +// - Uint256: the result of the division. +func (u Uint256) Div(v Uint256) Uint256 { + if v.IsZero() { + log.Panicf("division by zero") + } + + if u.IsZero() || u.LessThan(v) { + return Uint256{} + } + + if v.Equals(Uint256{0, 0, 0, 1}) { + return u // Division by 1 + } + + var q, r Uint256 + r = u + + for r.GreaterThanOrEqual(v) { + var t Uint256 = v + var m Uint256 = Uint256{0, 0, 0, 1} + for t.LeftShift(1).LessThanOrEqual(r) { + t = t.LeftShift(1) + m = m.LeftShift(1) + } + r = r.Sub(t) + q = q.Add(m) + } + + return q +} + +func (u Uint256) Equals(v Uint256) bool { + return u.Cmp(v) == 0 +} + +func (u Uint256) LessThan(v Uint256) bool { + return u.Cmp(v) < 0 +} + +func (u Uint256) GreaterThan(v Uint256) bool { + return u.Cmp(v) > 0 +} + +func (u Uint256) LessThanOrEqual(v Uint256) bool { + return !u.GreaterThan(v) +} + +func (u Uint256) GreaterThanOrEqual(v Uint256) bool { + return !u.LessThan(v) +} + +func (u Uint256) And(v Uint256) Uint256 { + return Uint256{ + w3: u.w3 & v.w3, + w2: u.w2 & v.w2, + w1: u.w1 & v.w1, + w0: u.w0 & v.w0, + } +} + +func (u Uint256) Or(v Uint256) Uint256 { + return Uint256{ + w3: u.w3 | v.w3, + w2: u.w2 | v.w2, + w1: u.w1 | v.w1, + w0: u.w0 | v.w0, + } +} + +func (u Uint256) Xor(v Uint256) Uint256 { + return Uint256{ + w3: u.w3 ^ v.w3, + w2: u.w2 ^ v.w2, + w1: u.w1 ^ v.w1, + w0: u.w0 ^ v.w0, + } +} + +func (u Uint256) Not() Uint256 { + return Uint256{ + w3: ^u.w3, + w2: ^u.w2, + w1: ^u.w1, + w0: ^u.w0, + } +} + +func (u Uint256) AsUint64() uint64 { + return u.w0 +} diff --git a/pkg/obifp/uint64.go b/pkg/obifp/uint64.go new file mode 100644 index 0000000..b74e088 --- /dev/null +++ b/pkg/obifp/uint64.go @@ -0,0 +1,231 @@ +package obifp + +import ( + "math" + "math/bits" + + log "github.com/sirupsen/logrus" +) + +type Uint64 struct { + w0 uint64 +} + +// Zero returns a zero value of type Uint64. +// +// No parameters. +// Returns a Uint64 value of 0. +func (u Uint64) Zero() Uint64 { + return Uint64{0} +} + +// MaxValue returns the maximum possible value of type Uint64. +// +// No parameters. +// Returns the maximum value of type Uint64. +func (u Uint64) MaxValue() Uint64 { + return Uint64{math.MaxUint64} +} + +// IsZero checks if the Uint64 value is zero. +// +// No parameters. +// Returns a boolean indicating if the value is zero. +func (u Uint64) IsZero() bool { + return u.w0 == 0 +} + +// Cast a Uint64 to a Uint64. +// +// Which is a no-op. +// +// No parameters. +// Returns the Uint64 value itself. +func (u Uint64) Uint64() Uint64 { + return u +} + +// Cast a Uint64 to a Uint128. +// +// No parameters. +// Returns a Uint128 value with the high field set to 0 and the low field set to the value of the Uint64. +func (u Uint64) Uint128() Uint128 { + return Uint128{w1: 0, w0: u.w0} +} + +// Cast a Uint64 to a Uint256. +// +// No parameters. +// Returns a Uint256 value with the high fields set to 0 and the low fields set to the value of the Uint64. +func (u Uint64) Uint256() Uint256 { + return Uint256{w3: 0, w2: 0, w1: 0, w0: u.w0} +} + +func (u Uint64) Set64(v uint64) Uint64 { + + return Uint64{ + w0: v, + } +} + +// LeftShift64 performs a left shift operation on the Uint64 value by n bits, with carry-in from carryIn. +// +// The carry-in value is used as the first bit of the shifted value. +// +// The function returns u << n | (carryIn & ((1 << n) - 1)). +// +// This is a shift left operation, lowest bits are set with the lowest bits of +// the carry-in value instead of 0 as they would be in classical a left shift operation. +// +// Parameters: +// - n: the number of bits to shift by. +// - carryIn: the carry-in value. +// +// Returns: +// - value: the result of the left shift operation. +// - carry: the carry-out value. +func (u Uint64) LeftShift64(n uint, carryIn uint64) (value, carry uint64) { + switch { + case n == 0: + return u.w0, 0 + + case n < 64: + return u.w0<> (64 - n) + + case n == 64: + return carryIn, u.w0 + } + + log.Warnf("Uint64 overflow at LeftShift64(%v, %v)", u, n) + return 0, 0 + +} + +// RightShift64 performs a right shift operation on the Uint64 value by n bits, with carry-out to carry. +// +// The function returns the result of the right shift operation and the carry-out value. +// +// Parameters: +// - n: the number of bits to shift by. +// +// Returns: +// - value: the result of the right shift operation. +// - carry: the carry-out value. +func (u Uint64) RightShift64(n uint, carryIn uint64) (value, carry uint64) { + switch { + case n == 0: + return u.w0, 0 + + case n < 64: + return u.w0>>n | (carryIn & ^((1 << (64 - n)) - 1)), u.w0 << (n - 64) + + case n == 64: + return carryIn, u.w0 + } + + log.Warnf("Uint64 overflow at RightShift64(%v, %v)", u, n) + return 0, 0 +} + +func (u Uint64) Add64(v Uint64, carryIn uint64) (value, carry uint64) { + return bits.Add64(u.w0, v.w0, uint64(carryIn)) +} + +func (u Uint64) Sub64(v Uint64, carryIn uint64) (value, carry uint64) { + return bits.Sub64(u.w0, v.w0, uint64(carryIn)) +} + +func (u Uint64) Mul64(v Uint64) (value, carry uint64) { + return bits.Mul64(u.w0, v.w0) +} + +func (u Uint64) LeftShift(n uint) Uint64 { + sl, _ := u.LeftShift64(n, 0) + return Uint64{w0: sl} +} + +func (u Uint64) RightShift(n uint) Uint64 { + sr, _ := u.RightShift64(n, 0) + return Uint64{w0: sr} +} + +func (u Uint64) Add(v Uint64) Uint64 { + value, carry := u.Add64(v, 0) + + if carry != 0 { + log.Panicf("Uint64 overflow at Add(%v, %v)", u, v) + } + + return Uint64{w0: value} +} + +func (u Uint64) Sub(v Uint64) Uint64 { + value, carry := u.Sub64(v, 0) + + if carry != 0 { + log.Panicf("Uint64 overflow at Sub(%v, %v)", u, v) + } + + return Uint64{w0: value} +} + +func (u Uint64) Mul(v Uint64) Uint64 { + value, carry := u.Mul64(v) + + if carry != 0 { + log.Panicf("Uint64 overflow at Mul(%v, %v)", u, v) + } + + return Uint64{w0: value} +} + +func (u Uint64) Cmp(v Uint64) int { + switch { + case u.w0 < v.w0: + return -1 + case u.w0 > v.w0: + return 1 + default: + return 0 + } +} + +func (u Uint64) Equals(v Uint64) bool { + return u.Cmp(v) == 0 +} + +func (u Uint64) LessThan(v Uint64) bool { + return u.Cmp(v) < 0 +} + +func (u Uint64) GreaterThan(v Uint64) bool { + return u.Cmp(v) > 0 +} + +func (u Uint64) LessThanOrEqual(v Uint64) bool { + return !u.GreaterThan(v) +} + +func (u Uint64) GreaterThanOrEqual(v Uint64) bool { + return !u.LessThan(v) +} + +func (u Uint64) And(v Uint64) Uint64 { + return Uint64{w0: u.w0 & v.w0} +} + +func (u Uint64) Or(v Uint64) Uint64 { + return Uint64{w0: u.w0 | v.w0} +} + +func (u Uint64) Xor(v Uint64) Uint64 { + return Uint64{w0: u.w0 ^ v.w0} +} + +func (u Uint64) Not() Uint64 { + return Uint64{w0: ^u.w0} +} + +func (u Uint64) AsUint64() uint64 { + return u.w0 +} diff --git a/pkg/obifp/unint.go b/pkg/obifp/unint.go new file mode 100644 index 0000000..5e01ed2 --- /dev/null +++ b/pkg/obifp/unint.go @@ -0,0 +1,41 @@ +package obifp + +type FPUint[T Uint64 | Uint128 | Uint256] interface { + Zero() T + Set64(v uint64) T + + IsZero() bool + LeftShift(n uint) T + RightShift(n uint) T + + Add(v T) T + Sub(v T) T + Mul(v T) T + //Div(v T) T + + And(v T) T + Or(v T) T + Xor(v T) T + Not() T + + LessThan(v T) bool + LessThanOrEqual(v T) bool + GreaterThan(v T) bool + GreaterThanOrEqual(v T) bool + + AsUint64() uint64 + + Uint64 | Uint128 | Uint256 +} + +func ZeroUint[T FPUint[T]]() T { + return *new(T) +} + +func OneUint[T FPUint[T]]() T { + return ZeroUint[T]().Set64(1) +} + +func From64[T FPUint[T]](v uint64) T { + return ZeroUint[T]().Set64(v) +} diff --git a/pkg/obiiter/batchiterator.go b/pkg/obiiter/batchiterator.go index f6e43fe..f2e5df7 100644 --- a/pkg/obiiter/batchiterator.go +++ b/pkg/obiiter/batchiterator.go @@ -19,18 +19,32 @@ import ( var globalLocker sync.WaitGroup var globalLockerCounter = 0 +// RegisterAPipe increments the global lock counter and adds a new pipe to the global wait group. +// +// No parameters. +// No return values. func RegisterAPipe() { globalLocker.Add(1) globalLockerCounter++ log.Debugln(globalLockerCounter, " Pipes are registered now") } +// UnregisterPipe decrements the global lock counter and signals that a pipe has finished. +// +// No parameters. +// No return values. func UnregisterPipe() { globalLocker.Done() globalLockerCounter-- log.Debugln(globalLockerCounter, "are still registered") } +// WaitForLastPipe waits until all registered pipes have finished. +// +// THe function have to be called at the end of every main function. +// +// No parameters. +// No return values. func WaitForLastPipe() { globalLocker.Wait() } diff --git a/pkg/obikmer/kmermap.go b/pkg/obikmer/kmermap.go index 1511214..a2ed037 100644 --- a/pkg/obikmer/kmermap.go +++ b/pkg/obikmer/kmermap.go @@ -3,98 +3,147 @@ package obikmer import ( "os" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "github.com/schollz/progressbar/v3" + log "github.com/sirupsen/logrus" ) -type KmerMap struct { - index map[KmerIdx64][]*obiseq.BioSequence - kmersize int - kmermask KmerIdx64 +type KmerMap[T obifp.FPUint[T]] struct { + index map[T][]*obiseq.BioSequence + Kmersize uint + kmermask T + + leftMask T + rightMask T + sparseMask T + + SparseAt int } type KmerMatch map[*obiseq.BioSequence]int -func (k *KmerMap) KmerSize() int { - return k.kmersize +func (k *KmerMap[T]) KmerSize() uint { + return k.Kmersize } -func (k *KmerMap) Len() int { +func (k *KmerMap[T]) Len() int { return len(k.index) } -func (k *KmerMap) Push(sequence *obiseq.BioSequence) { - current := KmerIdx64(0) - ccurrent := KmerIdx64(0) - lshift := uint(2 * (k.kmersize - 1)) +func (k *KmerMap[T]) KmerAsString(kmer T) string { + buff := make([]byte, k.Kmersize) + ks := int(k.Kmersize) + + if k.SparseAt >= 0 { + ks-- + } + + for i, j := 0, int(k.Kmersize)-1; i < ks; i++ { + code := kmer.And(obifp.From64[T](3)).AsUint64() + buff[j] = decode[code] + j-- + if k.SparseAt >= 0 && j == k.SparseAt { + buff[j] = '#' + j-- + } + kmer = kmer.RightShift(2) + } + + return string(buff) +} + +func (k *KmerMap[T]) NormalizedKmerSlice(sequence *obiseq.BioSequence, buff *[]T) []T { + + makeSparseAt := func(kmer T) T { + if k.SparseAt == -1 { + return kmer + } + + return kmer.And(k.leftMask).RightShift(2).Or(kmer.And(k.rightMask)) + } + + normalizedKmer := func(fw, rv T) T { + + if k.SparseAt >= 0 { + fw = makeSparseAt(fw) + rv = makeSparseAt(rv) + } + + if fw.LessThan(rv) { + return fw + } + + return rv + } + + current := obifp.ZeroUint[T]() + ccurrent := obifp.ZeroUint[T]() + lshift := uint(2 * (k.Kmersize - 1)) + + sup := sequence.Len() - int(k.Kmersize) + 1 + + var rep []T + if buff == nil { + rep = make([]T, 0, sup) + } else { + rep = (*buff)[:0] + } nuc := sequence.Sequence() + size := 0 - for i := 0; i < len(nuc)-k.kmersize+1; i++ { - current <<= 2 - ccurrent >>= 2 + for i := 0; i < len(nuc); i++ { + current = current.LeftShift(2) + ccurrent = ccurrent.RightShift(2) + code := iupac[nuc[i]] ccode := iupac[revcompnuc[nuc[i]]] if len(code) != 1 { - current = KmerIdx64(0) - ccurrent = KmerIdx64(0) + current = obifp.ZeroUint[T]() + ccurrent = obifp.ZeroUint[T]() size = 0 continue } - current |= KmerIdx64(code[0]) - ccurrent |= KmerIdx64(ccode[0]) << lshift + current = current.Or(obifp.From64[T](uint64(code[0]))) + ccurrent = ccurrent.Or(obifp.From64[T](uint64(ccode[0])).LeftShift(lshift)) + size++ - if size == k.kmersize { + if size == int(k.Kmersize) { - kmer := min(k.kmermask¤t, k.kmermask&ccurrent) - k.index[kmer] = append(k.index[kmer], sequence) + kmer := normalizedKmer(current, ccurrent) + rep = append(rep, kmer) size-- } } + + return rep } -func (k *KmerMap) Query(sequence *obiseq.BioSequence) KmerMatch { - current := KmerIdx64(0) - ccurrent := KmerIdx64(0) +func (k *KmerMap[T]) Push(sequence *obiseq.BioSequence) { + kmers := k.NormalizedKmerSlice(sequence, nil) + for _, kmer := range kmers { + k.index[kmer] = append(k.index[kmer], sequence) + } +} +func (k *KmerMap[T]) Query(sequence *obiseq.BioSequence) KmerMatch { + kmers := k.NormalizedKmerSlice(sequence, nil) rep := make(KmerMatch) - nuc := sequence.Sequence() - size := 0 - for i := 0; i < len(nuc)-k.kmersize+1; i++ { - current <<= 2 - ccurrent >>= 2 - - code := iupac[nuc[i]] - ccode := iupac[revcompnuc[nuc[i]]] - - if len(code) != 1 { - current = KmerIdx64(0) - ccurrent = KmerIdx64(0) - size = 0 - continue - } - - current |= KmerIdx64(code[0]) - ccurrent |= KmerIdx64(ccode[0]) << uint(2*(k.kmersize-1)) - size++ - - if size == k.kmersize { - kmer := min(k.kmermask¤t, k.kmermask&ccurrent) - if _, ok := k.index[kmer]; ok { - for _, seq := range k.index[kmer] { - if seq != sequence { - if _, ok := rep[seq]; !ok { - rep[seq] = 0 - } - rep[seq]++ + for _, kmer := range kmers { + if _, ok := k.index[kmer]; ok { + for _, seq := range k.index[kmer] { + if seq != sequence { + if _, ok := rep[seq]; !ok { + rep[seq] = 0 } + rep[seq]++ } } - size-- } } @@ -135,12 +184,54 @@ func (k *KmerMatch) Max() *obiseq.BioSequence { return maxseq } -func NewKmerMap(sequences obiseq.BioSequenceSlice, kmersize int) *KmerMap { - idx := make(map[KmerIdx64][]*obiseq.BioSequence) +func NewKmerMap[T obifp.FPUint[T]]( + sequences obiseq.BioSequenceSlice, + kmersize uint, + sparse bool) *KmerMap[T] { + idx := make(map[T][]*obiseq.BioSequence) - kmermask := KmerIdx64(^(^uint64(0) << (uint64(kmersize) * 2))) + sparseAt := -1 - kmap := &KmerMap{kmersize: kmersize, kmermask: kmermask, index: idx} + if sparse && kmersize%2 == 0 { + log.Warnf("Kmer size must be odd when using sparse mode") + kmersize++ + } + + if !sparse && kmersize%2 == 1 { + log.Warnf("Kmer size must be even when not using sparse mode") + kmersize-- + + } + + if sparse { + sparseAt = int(kmersize / 2) + } + + kmermask := obifp.OneUint[T]().LeftShift(kmersize * 2).Sub(obifp.OneUint[T]()) + leftMask := obifp.ZeroUint[T]() + rightMask := obifp.ZeroUint[T]() + + if sparseAt >= 0 { + if sparseAt >= int(kmersize) { + sparseAt = -1 + } else { + pos := kmersize - 1 - uint(sparseAt) + left := uint(sparseAt) * 2 + right := pos * 2 + + leftMask = obifp.OneUint[T]().LeftShift(left).Sub(obifp.OneUint[T]()).LeftShift(right + 2) + rightMask = obifp.OneUint[T]().LeftShift(right).Sub(obifp.OneUint[T]()) + } + } + + kmap := &KmerMap[T]{ + Kmersize: kmersize, + kmermask: kmermask, + leftMask: leftMask, + rightMask: rightMask, + index: idx, + SparseAt: sparseAt, + } n := len(sequences) pbopt := make([]progressbar.Option, 0, 5) @@ -163,14 +254,3 @@ func NewKmerMap(sequences obiseq.BioSequenceSlice, kmersize int) *KmerMap { return kmap } - -func (k *KmerMap) MakeCountMatchWorker(minKmerCount int) obiseq.SeqWorker { - return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { - matches := k.Query(sequence) - matches.FilterMinCount(minKmerCount) - n := matches.Len() - - sequence.SetAttribute("obikmer_match_count", n) - return obiseq.BioSequenceSlice{sequence}, nil - } -} diff --git a/pkg/obioptions/version.go b/pkg/obioptions/version.go index 8053eaa..2f26c42 100644 --- a/pkg/obioptions/version.go +++ b/pkg/obioptions/version.go @@ -7,7 +7,7 @@ import ( // TODO: The version number is extracted from git. This induces that the version // corresponds to the last commit, and not the one when the file will be // commited -var _Commit = "3f57935" +var _Commit = "bdb96dd" var _Version = "Release 4.2.0" // Version returns the version of the obitools package. diff --git a/pkg/obitools/obiconvert/sequence_reader.go b/pkg/obitools/obiconvert/sequence_reader.go index e545004..4f72971 100644 --- a/pkg/obitools/obiconvert/sequence_reader.go +++ b/pkg/obitools/obiconvert/sequence_reader.go @@ -188,3 +188,17 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) { return iterator, nil } + +func OpenSequenceDataErrorMessage(args []string, err error) { + if err != nil { + switch len(args) { + case 0: + log.Errorf("Cannot open stdin (%v)", err) + case 1: + log.Errorf("Cannot open file %s: %v", args[0], err) + default: + log.Errorf("Cannot open one of the data files: %v", err) + } + os.Exit(1) + } +} diff --git a/pkg/obitools/obikmersim/obikmersim.go b/pkg/obitools/obikmersim/obikmersim.go new file mode 100644 index 0000000..06c02e7 --- /dev/null +++ b/pkg/obitools/obikmersim/obikmersim.go @@ -0,0 +1,172 @@ +package obikmersim + +import ( + "math" + + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" +) + +func _Abs(x int) int { + if x < 0 { + return -x + } + return x +} + +func MakeCountMatchWorker[T obifp.FPUint[T]](k *obikmer.KmerMap[T], minKmerCount int) obiseq.SeqWorker { + return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + matches := k.Query(sequence) + matches.FilterMinCount(minKmerCount) + n := matches.Len() + + sequence.SetAttribute("obikmer_match_count", n) + sequence.SetAttribute("obikmer_kmer_size", k.Kmersize) + sequence.SetAttribute("obikmer_sparse_kmer", k.SparseAt >= 0) + return obiseq.BioSequenceSlice{sequence}, nil + } +} + +func MakeKmerAlignWorker[T obifp.FPUint[T]]( + k *obikmer.KmerMap[T], + minKmerCount int, + gap, scale float64, delta int, fastScoreRel bool, + minIdentity float64, withStats bool) obiseq.SeqWorker { + return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) { + arena := obialign.MakePEAlignArena(150, 150) + shifts := make(map[int]int) + + matches := k.Query(sequence) + matches.FilterMinCount(minKmerCount) + + slice := obiseq.NewBioSequenceSlice(matches.Len()) + *slice = (*slice)[:0] + + for _, seq := range matches.Sequences() { + idmatched_id := seq.Id() + + score, path, fastcount, over, fastscore, reverse := obialign.ReadAlign( + sequence, seq, + gap, scale, delta, + fastScoreRel, + arena, &shifts, + ) + + if reverse { + idmatched_id = idmatched_id + "-rev" + seq = seq.ReverseComplement(false) + } + + cons, match := obialign.BuildQualityConsensus(sequence, seq, path, true, arena) + + left := path[0] + right := 0 + if path[len(path)-1] == 0 { + right = path[len(path)-2] + } + lcons := cons.Len() + aliLength := lcons - _Abs(left) - _Abs(right) + identity := float64(match) / float64(aliLength) + if aliLength == 0 { + identity = 0 + } + + rep := sequence.Copy() + + rep.SetAttribute("obikmer_match_id", idmatched_id) + rep.SetAttribute("obikmer_fast_count", fastcount) + rep.SetAttribute("obikmer_fast_overlap", over) + rep.SetAttribute("obikmer_fast_score", math.Round(fastscore*1000)/1000) + + if reverse { + rep.SetAttribute("obikmer_orientation", "reverse") + } else { + rep.SetAttribute("obikmer_orientation", "forward") + } + + if aliLength >= int(k.KmerSize()) && identity >= minIdentity { + if withStats { + if left < 0 { + rep.SetAttribute("seq_a_single", -left) + rep.SetAttribute("ali_dir", "left") + } else { + rep.SetAttribute("seq_b_single", left) + rep.SetAttribute("ali_dir", "right") + } + + if right < 0 { + right = -right + rep.SetAttribute("seq_a_single", right) + } else { + rep.SetAttribute("seq_b_single", right) + } + rep.SetAttribute("obikmer_score", score) + scoreNorm := float64(0) + if aliLength > 0 { + scoreNorm = math.Round(float64(match)/float64(aliLength)*1000) / 1000 + } else { + scoreNorm = 0 + } + + rep.SetAttribute("obikmer_score_norm", scoreNorm) + rep.SetAttribute("obikmer_ali_length", aliLength) + + rep.SetAttribute("seq_ab_match", match) + + } + + *slice = append(*slice, rep) + } + + } + + return *slice, nil + } +} + +func CLILookForSharedKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence { + var newIter obiiter.IBioSequence + + source, references := CLIReference() + + if iterator == obiiter.NilIBioSequence { + iterator = obiiter.IBatchOver(source, references, obioptions.CLIBatchSize()) + } + + kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode()) + worker := MakeCountMatchWorker(kmerMatch, CLIMinSharedKmers()) + newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers()) + + if CLISelf() { + newIter = newIter.Speed("Counting similar reads", references.Len()) + } else { + newIter = newIter.Speed("Counting similar reads") + } + + return newIter.FilterEmpty() +} + +func CLIAlignSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence { + var newIter obiiter.IBioSequence + + source, references := CLIReference() + + if iterator == obiiter.NilIBioSequence { + iterator = obiiter.IBatchOver(source, references, obioptions.CLIBatchSize()) + } + + if CLISelf() { + iterator = iterator.Speed("Aligning reads", references.Len()) + } else { + iterator = iterator.Speed("Aligning reads") + } + kmerMatch := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode()) + worker := MakeKmerAlignWorker(kmerMatch, CLIMinSharedKmers(), CLIGap(), CLIScale(), CLIDelta(), CLIFastRelativeScore(), 0.8, true) + newIter = iterator.MakeIWorker(worker, false, obioptions.CLIParallelWorkers()) + + return newIter.FilterEmpty() +} diff --git a/pkg/obitools/obikmersim/options.go b/pkg/obitools/obikmersim/options.go new file mode 100644 index 0000000..fe6bd1d --- /dev/null +++ b/pkg/obitools/obikmersim/options.go @@ -0,0 +1,140 @@ +package obikmersim + +import ( + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" + "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" + "github.com/DavidGamba/go-getoptions" +) + +var _KmerSize = 30 +var _Sparse = false +var _References = []string{} +var _MinSharedKmers = 1 +var _Self = false + +var _Delta = 5 +var _PenalityScale = 1.0 +var _GapPenality = 2.0 +var _FastScoreAbs = false + +// PCROptionSet defines every options related to a simulated PCR. +// +// The function adds to a CLI every options proposed to the user +// to tune the parametters of the PCR simulation algorithm. +// +// # Parameters +// +// - option : is a pointer to a getoptions.GetOpt instance normaly +// produced by the +func KmerSimCountOptionSet(options *getoptions.GetOpt) { + + options.IntVar(&_KmerSize, "kmer-size", _KmerSize, + options.Alias("k"), + options.Description("Kmer size to use.")) + + options.BoolVar(&_Sparse, "sparse", _Sparse, + options.Alias("S"), + options.Description("Set sparse kmer mode.")) + + options.StringSliceVar(&_References, "reference", 1, 1, + options.Alias("r"), + options.Description("Reference sequence.")) + + options.IntVar(&_MinSharedKmers, "min-shared-kmers", _MinSharedKmers, + options.Alias("m"), + options.Description("Minimum number of shared kmers between two sequences.")) + + options.BoolVar(&_Self, "self", _Self, + options.Alias("s"), + options.Description("Compare references with themselves.")) + +} + +func KmerSimMatchOptionSet(options *getoptions.GetOpt) { + options.IntVar(&_Delta, "delta", _Delta, + options.Alias("d"), + options.Description("Delta value for the match.")) + + options.Float64Var(&_PenalityScale, "penality-scale", _PenalityScale, + options.Alias("X"), + options.Description("Scale factor applied to the mismatch score and the gap penality (default 1).")) + + options.Float64Var(&_GapPenality, "gap-penality", _GapPenality, + options.Alias("G"), + options.Description("Gap penality expressed as the multiply factor applied to the mismatch score between two nucleotides with a quality of 40 (default 2).")) + + options.BoolVar(&_FastScoreAbs, "fast-absolute", _FastScoreAbs, + options.Alias("a"), + options.Description("Use fast absolute score mode.")) +} + +func CountOptionSet(options *getoptions.GetOpt) { + obiconvert.OptionSet(options) + KmerSimCountOptionSet(options) +} + +func MatchOptionSet(options *getoptions.GetOpt) { + obiconvert.OptionSet(options) + KmerSimCountOptionSet(options) + KmerSimMatchOptionSet(options) +} + +func CLIKmerSize() uint { + return uint(_KmerSize) +} + +func CLISparseMode() bool { + return _Sparse +} + +func CLIReference() (string, obiseq.BioSequenceSlice) { + + refnames, err := obiconvert.ExpandListOfFiles(false, _References...) + + if err != nil { + return "", obiseq.BioSequenceSlice{} + } + + nreader := 1 + + if obiconvert.CLINoInputOrder() { + nreader = obioptions.StrictReadWorker() + } + + source, references := obiformats.ReadSequencesBatchFromFiles( + refnames, + obiformats.ReadSequencesFromFile, + nreader).Load() + + return source, references +} + +func CLIMinSharedKmers() int { + return _MinSharedKmers +} + +func CLISelf() bool { + return _Self +} + +func CLIDelta() int { + return _Delta +} + +func CLIScale() float64 { + return _PenalityScale +} + +func CLIGapPenality() float64 { + return _GapPenality +} + +func CLIGap() float64 { + return _GapPenality +} + +func CLIFastRelativeScore() bool { + return !_FastScoreAbs +} diff --git a/pkg/obiutils/uint128.go b/pkg/obiutils/uint128.go deleted file mode 100644 index 3548111..0000000 --- a/pkg/obiutils/uint128.go +++ /dev/null @@ -1,457 +0,0 @@ -package obiutils // import "lukechampine.com/uint128" -// copied from https://github.com/lukechampine/uint128 - -import ( - "encoding/binary" - "errors" - "fmt" - "math" - "math/big" - "math/bits" -) - -// Zero is a zero-valued uint128. -var Zero Uint128 - -// MaxUint128 is the largest possible uint128 value. -var MaxUint128 = New(math.MaxUint64, math.MaxUint64) - -// A Uint128 is an unsigned 128-bit number. -type Uint128 struct { - Lo, Hi uint64 -} - -// IsZero returns true if u == 0. -func (u Uint128) IsZero() bool { - // NOTE: we do not compare against Zero, because that is a global variable - // that could be modified. - return u == Uint128{} -} - -// Equals returns true if u == v. -// -// Uint128 values can be compared directly with ==, but use of the Equals method -// is preferred for consistency. -func (u Uint128) Equals(v Uint128) bool { - return u == v -} - -// Equals64 returns true if u == v. -func (u Uint128) Equals64(v uint64) bool { - return u.Lo == v && u.Hi == 0 -} - -// Cmp compares u and v and returns: -// -// -1 if u < v -// 0 if u == v -// +1 if u > v -// -func (u Uint128) Cmp(v Uint128) int { - if u == v { - return 0 - } else if u.Hi < v.Hi || (u.Hi == v.Hi && u.Lo < v.Lo) { - return -1 - } else { - return 1 - } -} - -// Cmp64 compares u and v and returns: -// -// -1 if u < v -// 0 if u == v -// +1 if u > v -// -func (u Uint128) Cmp64(v uint64) int { - if u.Hi == 0 && u.Lo == v { - return 0 - } else if u.Hi == 0 && u.Lo < v { - return -1 - } else { - return 1 - } -} - -// And returns u&v. -func (u Uint128) And(v Uint128) Uint128 { - return Uint128{u.Lo & v.Lo, u.Hi & v.Hi} -} - -// And64 returns u&v. -func (u Uint128) And64(v uint64) Uint128 { - return Uint128{u.Lo & v, u.Hi & 0} -} - -// Or returns u|v. -func (u Uint128) Or(v Uint128) Uint128 { - return Uint128{u.Lo | v.Lo, u.Hi | v.Hi} -} - -// Or64 returns u|v. -func (u Uint128) Or64(v uint64) Uint128 { - return Uint128{u.Lo | v, u.Hi | 0} -} - -// Xor returns u^v. -func (u Uint128) Xor(v Uint128) Uint128 { - return Uint128{u.Lo ^ v.Lo, u.Hi ^ v.Hi} -} - -// Xor64 returns u^v. -func (u Uint128) Xor64(v uint64) Uint128 { - return Uint128{u.Lo ^ v, u.Hi ^ 0} -} - -// Add returns u+v. -func (u Uint128) Add(v Uint128) Uint128 { - lo, carry := bits.Add64(u.Lo, v.Lo, 0) - hi, carry := bits.Add64(u.Hi, v.Hi, carry) - if carry != 0 { - panic("overflow") - } - return Uint128{lo, hi} -} - -// AddWrap returns u+v with wraparound semantics; for example, -// Max.AddWrap(From64(1)) == Zero. -func (u Uint128) AddWrap(v Uint128) Uint128 { - lo, carry := bits.Add64(u.Lo, v.Lo, 0) - hi, _ := bits.Add64(u.Hi, v.Hi, carry) - return Uint128{lo, hi} -} - -// Add64 returns u+v. -func (u Uint128) Add64(v uint64) Uint128 { - lo, carry := bits.Add64(u.Lo, v, 0) - hi, carry := bits.Add64(u.Hi, 0, carry) - if carry != 0 { - panic("overflow") - } - return Uint128{lo, hi} -} - -// AddWrap64 returns u+v with wraparound semantics; for example, -// Max.AddWrap64(1) == Zero. -func (u Uint128) AddWrap64(v uint64) Uint128 { - lo, carry := bits.Add64(u.Lo, v, 0) - hi := u.Hi + carry - return Uint128{lo, hi} -} - -// Sub returns u-v. -func (u Uint128) Sub(v Uint128) Uint128 { - lo, borrow := bits.Sub64(u.Lo, v.Lo, 0) - hi, borrow := bits.Sub64(u.Hi, v.Hi, borrow) - if borrow != 0 { - panic("underflow") - } - return Uint128{lo, hi} -} - -// SubWrap returns u-v with wraparound semantics; for example, -// Zero.SubWrap(From64(1)) == Max. -func (u Uint128) SubWrap(v Uint128) Uint128 { - lo, borrow := bits.Sub64(u.Lo, v.Lo, 0) - hi, _ := bits.Sub64(u.Hi, v.Hi, borrow) - return Uint128{lo, hi} -} - -// Sub64 returns u-v. -func (u Uint128) Sub64(v uint64) Uint128 { - lo, borrow := bits.Sub64(u.Lo, v, 0) - hi, borrow := bits.Sub64(u.Hi, 0, borrow) - if borrow != 0 { - panic("underflow") - } - return Uint128{lo, hi} -} - -// SubWrap64 returns u-v with wraparound semantics; for example, -// Zero.SubWrap64(1) == Max. -func (u Uint128) SubWrap64(v uint64) Uint128 { - lo, borrow := bits.Sub64(u.Lo, v, 0) - hi := u.Hi - borrow - return Uint128{lo, hi} -} - -// Mul returns u*v, panicking on overflow. -func (u Uint128) Mul(v Uint128) Uint128 { - hi, lo := bits.Mul64(u.Lo, v.Lo) - p0, p1 := bits.Mul64(u.Hi, v.Lo) - p2, p3 := bits.Mul64(u.Lo, v.Hi) - hi, c0 := bits.Add64(hi, p1, 0) - hi, c1 := bits.Add64(hi, p3, c0) - if (u.Hi != 0 && v.Hi != 0) || p0 != 0 || p2 != 0 || c1 != 0 { - panic("overflow") - } - return Uint128{lo, hi} -} - -// MulWrap returns u*v with wraparound semantics; for example, -// Max.MulWrap(Max) == 1. -func (u Uint128) MulWrap(v Uint128) Uint128 { - hi, lo := bits.Mul64(u.Lo, v.Lo) - hi += u.Hi*v.Lo + u.Lo*v.Hi - return Uint128{lo, hi} -} - -// Mul64 returns u*v, panicking on overflow. -func (u Uint128) Mul64(v uint64) Uint128 { - hi, lo := bits.Mul64(u.Lo, v) - p0, p1 := bits.Mul64(u.Hi, v) - hi, c0 := bits.Add64(hi, p1, 0) - if p0 != 0 || c0 != 0 { - panic("overflow") - } - return Uint128{lo, hi} -} - -// MulWrap64 returns u*v with wraparound semantics; for example, -// Max.MulWrap64(2) == Max.Sub64(1). -func (u Uint128) MulWrap64(v uint64) Uint128 { - hi, lo := bits.Mul64(u.Lo, v) - hi += u.Hi * v - return Uint128{lo, hi} -} - -// Div returns u/v. -func (u Uint128) Div(v Uint128) Uint128 { - q, _ := u.QuoRem(v) - return q -} - -// Div64 returns u/v. -func (u Uint128) Div64(v uint64) Uint128 { - q, _ := u.QuoRem64(v) - return q -} - -// QuoRem returns q = u/v and r = u%v. -func (u Uint128) QuoRem(v Uint128) (q, r Uint128) { - if v.Hi == 0 { - var r64 uint64 - q, r64 = u.QuoRem64(v.Lo) - r = From64(r64) - } else { - // generate a "trial quotient," guaranteed to be within 1 of the actual - // quotient, then adjust. - n := uint(bits.LeadingZeros64(v.Hi)) - v1 := v.Lsh(n) - u1 := u.Rsh(1) - tq, _ := bits.Div64(u1.Hi, u1.Lo, v1.Hi) - tq >>= 63 - n - if tq != 0 { - tq-- - } - q = From64(tq) - // calculate remainder using trial quotient, then adjust if remainder is - // greater than divisor - r = u.Sub(v.Mul64(tq)) - if r.Cmp(v) >= 0 { - q = q.Add64(1) - r = r.Sub(v) - } - } - return -} - -// QuoRem64 returns q = u/v and r = u%v. -func (u Uint128) QuoRem64(v uint64) (q Uint128, r uint64) { - if u.Hi < v { - q.Lo, r = bits.Div64(u.Hi, u.Lo, v) - } else { - q.Hi, r = bits.Div64(0, u.Hi, v) - q.Lo, r = bits.Div64(r, u.Lo, v) - } - return -} - -// Mod returns r = u%v. -func (u Uint128) Mod(v Uint128) (r Uint128) { - _, r = u.QuoRem(v) - return -} - -// Mod64 returns r = u%v. -func (u Uint128) Mod64(v uint64) (r uint64) { - _, r = u.QuoRem64(v) - return -} - -// Lsh returns u< 64 { - s.Lo = 0 - s.Hi = u.Lo << (n - 64) - } else { - s.Lo = u.Lo << n - s.Hi = u.Hi<>(64-n) - } - return -} - -// Rsh returns u>>n. -func (u Uint128) Rsh(n uint) (s Uint128) { - if n > 64 { - s.Lo = u.Hi >> (n - 64) - s.Hi = 0 - } else { - s.Lo = u.Lo>>n | u.Hi<<(64-n) - s.Hi = u.Hi >> n - } - return -} - -// LeadingZeros returns the number of leading zero bits in u; the result is 128 -// for u == 0. -func (u Uint128) LeadingZeros() int { - if u.Hi > 0 { - return bits.LeadingZeros64(u.Hi) - } - return 64 + bits.LeadingZeros64(u.Lo) -} - -// TrailingZeros returns the number of trailing zero bits in u; the result is -// 128 for u == 0. -func (u Uint128) TrailingZeros() int { - if u.Lo > 0 { - return bits.TrailingZeros64(u.Lo) - } - return 64 + bits.TrailingZeros64(u.Hi) -} - -// OnesCount returns the number of one bits ("population count") in u. -func (u Uint128) OnesCount() int { - return bits.OnesCount64(u.Hi) + bits.OnesCount64(u.Lo) -} - -// RotateLeft returns the value of u rotated left by (k mod 128) bits. -func (u Uint128) RotateLeft(k int) Uint128 { - const n = 128 - s := uint(k) & (n - 1) - return u.Lsh(s).Or(u.Rsh(n - s)) -} - -// RotateRight returns the value of u rotated left by (k mod 128) bits. -func (u Uint128) RotateRight(k int) Uint128 { - return u.RotateLeft(-k) -} - -// Reverse returns the value of u with its bits in reversed order. -func (u Uint128) Reverse() Uint128 { - return Uint128{bits.Reverse64(u.Hi), bits.Reverse64(u.Lo)} -} - -// ReverseBytes returns the value of u with its bytes in reversed order. -func (u Uint128) ReverseBytes() Uint128 { - return Uint128{bits.ReverseBytes64(u.Hi), bits.ReverseBytes64(u.Lo)} -} - -// Len returns the minimum number of bits required to represent u; the result is -// 0 for u == 0. -func (u Uint128) Len() int { - return 128 - u.LeadingZeros() -} - -// String returns the base-10 representation of u as a string. -func (u Uint128) String() string { - if u.IsZero() { - return "0" - } - buf := []byte("0000000000000000000000000000000000000000") // log10(2^128) < 40 - for i := len(buf); ; i -= 19 { - q, r := u.QuoRem64(1e19) // largest power of 10 that fits in a uint64 - var n int - for ; r != 0; r /= 10 { - n++ - buf[i-n] += byte(r % 10) - } - if q.IsZero() { - return string(buf[i-n:]) - } - u = q - } -} - -// PutBytes stores u in b in little-endian order. It panics if len(b) < 16. -func (u Uint128) PutBytes(b []byte) { - binary.LittleEndian.PutUint64(b[:8], u.Lo) - binary.LittleEndian.PutUint64(b[8:], u.Hi) -} - -// PutBytesBE stores u in b in big-endian order. It panics if len(ip) < 16. -func (u Uint128) PutBytesBE(b []byte) { - binary.BigEndian.PutUint64(b[:8], u.Hi) - binary.BigEndian.PutUint64(b[8:], u.Lo) -} - -// Big returns u as a *big.Int. -func (u Uint128) Big() *big.Int { - i := new(big.Int).SetUint64(u.Hi) - i = i.Lsh(i, 64) - i = i.Xor(i, new(big.Int).SetUint64(u.Lo)) - return i -} - -// Scan implements fmt.Scanner. -func (u *Uint128) Scan(s fmt.ScanState, ch rune) error { - i := new(big.Int) - if err := i.Scan(s, ch); err != nil { - return err - } else if i.Sign() < 0 { - return errors.New("value cannot be negative") - } else if i.BitLen() > 128 { - return errors.New("value overflows Uint128") - } - u.Lo = i.Uint64() - u.Hi = i.Rsh(i, 64).Uint64() - return nil -} - -// New returns the Uint128 value (lo,hi). -func New(lo, hi uint64) Uint128 { - return Uint128{lo, hi} -} - -// From64 converts v to a Uint128 value. -func From64(v uint64) Uint128 { - return New(v, 0) -} - -// FromBytes converts b to a Uint128 value. -func FromBytes(b []byte) Uint128 { - return New( - binary.LittleEndian.Uint64(b[:8]), - binary.LittleEndian.Uint64(b[8:]), - ) -} - -// FromBytesBE converts big-endian b to a Uint128 value. -func FromBytesBE(b []byte) Uint128 { - return New( - binary.BigEndian.Uint64(b[8:]), - binary.BigEndian.Uint64(b[:8]), - ) -} - -// FromBig converts i to a Uint128 value. It panics if i is negative or -// overflows 128 bits. -func FromBig(i *big.Int) (u Uint128) { - if i.Sign() < 0 { - panic("value cannot be negative") - } else if i.BitLen() > 128 { - panic("value overflows Uint128") - } - u.Lo = i.Uint64() - u.Hi = i.Rsh(i, 64).Uint64() - return u -} - -// FromString parses s as a Uint128 value. -func FromString(s string) (u Uint128, err error) { - _, err = fmt.Sscan(s, &u) - return -} - -