change the model for representing paired reads and extend its usage to other commands

This commit is contained in:
2023-02-23 23:35:58 +01:00
parent ebb05fcdf7
commit 072b85e155
23 changed files with 598 additions and 338 deletions

View File

@@ -27,6 +27,9 @@ var __output_solexa_quality__ = false
var __no_progress_bar__ = false
var __compressed__ = false
var __output_file_name__ = "-"
var __paired_file_name__ = ""
func InputOptionSet(options *getoptions.GetOpt) {
// options.IntVar(&__skipped_entries__, "skip", __skipped_entries__,
// options.Description("The N first sequence records of the file are discarded from the analysis and not reported to the output file."))
@@ -73,15 +76,29 @@ func OutputOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
options.Description("Disable the progress bar printing"))
options.BoolVar(&__compressed__, "--compress", false,
options.BoolVar(&__compressed__, "compress", false,
options.Alias("Z"),
options.Description("Output is compressed"))
options.StringVar(&__output_file_name__, "out", __output_file_name__,
options.Alias("o"),
options.ArgName("FILENAME"),
options.Description("Filename used for saving the output"),
)
}
// PairedFilesOptionSet registers the "paired-with" command-line option on
// options. The option stores its FILENAME argument in __paired_file_name__,
// whose current value is also used as the option's default.
func PairedFilesOptionSet(options *getoptions.GetOpt) {
	argName := options.ArgName("FILENAME")
	description := options.Description("Filename containing the paired reads")

	options.StringVar(
		&__paired_file_name__,
		"paired-with",
		__paired_file_name__,
		argName,
		description,
	)
}
// OptionSet registers on options the input, output and paired-file option
// sets, in that order.
func OptionSet(options *getoptions.GetOpt) {
	registrars := []func(*getoptions.GetOpt){
		InputOptionSet,
		OutputOptionSet,
		PairedFilesOptionSet,
	}

	for _, register := range registrars {
		register(options)
	}
}
// Returns true if the number of reads described in the
@@ -170,3 +187,14 @@ func CLIOutputQualityShift() int {
// CLIProgressBar reports whether the progress bar must be displayed.
// It returns true unless the "no-progressbar" option set __no_progress_bar__.
func CLIProgressBar() bool {
return !__no_progress_bar__
}
// CLIOutPutFileName returns the output file name given with the "out" (-o)
// option. The default value is "-".
func CLIOutPutFileName() string {
return __output_file_name__
}
// CLIHasPairedFile reports whether a paired-reads file name has been set,
// i.e. whether __paired_file_name__ is a non-empty string.
func CLIHasPairedFile() bool {
	return len(__paired_file_name__) > 0
}
// CLIPairedFileName returns the file name given with the "paired-with"
// option, or the empty string when the option was not set.
func CLIPairedFileName() string {
return __paired_file_name__
}

View File

@@ -67,7 +67,7 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
return list_of_files, nil
}
func ReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
var iterator obiiter.IBioSequence
var reader func(string, ...obiformats.WithOption) (obiiter.IBioSequence, error)
@@ -142,6 +142,17 @@ func ReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
if err != nil {
return obiiter.NilIBioSequence, err
}
if CLIPairedFileName() != "" {
ip, err := reader(CLIPairedFileName(), opts...)
if err != nil {
return obiiter.NilIBioSequence, err
}
iterator = iterator.PairTo(ip)
}
}
// list_of_files = list_of_files[1:]

View File

@@ -1,6 +1,9 @@
package obiconvert
import (
"path/filepath"
"strings"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
@@ -8,6 +11,27 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
)
// BuildPairedFileNames derives the forward and reverse file names used to
// write paired reads from a single file name.
//
// The base name is split at its first dot: "_R1" (forward) and "_R2"
// (reverse) are appended to the part before the dot, then the remainder of
// the name, if any, is appended back as the suffix. The directory part of
// filename, when present, is re-attached with filepath.Join.
//
// Examples:
//
//	"out.fastq.gz" -> "out_R1.fastq.gz", "out_R2.fastq.gz"
//	"out"          -> "out_R1", "out_R2"
//
// It returns the forward file name followed by the reverse file name.
func BuildPairedFileNames(filename string) (string, string) {
	dir, name := filepath.Split(filename)
	parts := strings.SplitN(name, ".", 2)

	forward := parts[0] + "_R1"
	reverse := parts[0] + "_R2"

	// SplitN returns a single element when the name contains no dot, so the
	// suffix must be checked for existence before being read; indexing
	// parts[1] unconditionally panics on extension-less file names.
	if len(parts) > 1 && parts[1] != "" {
		suffix := "." + parts[1]
		forward += suffix
		reverse += suffix
	}

	if dir != "" {
		forward = filepath.Join(dir, forward)
		reverse = filepath.Join(dir, reverse)
	}

	return forward, reverse
}
func CLIWriteBioSequences(iterator obiiter.IBioSequence,
terminalAction bool, filenames ...string) (obiiter.IBioSequence, error) {
@@ -45,7 +69,32 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
var err error
if len(filenames) == 0 {
// No file names are specified or it is "-" : the output is done on stdout
if CLIOutPutFileName() != "-" || (len(filenames) > 0 && filenames[0] != "-") {
var fn string
if len(filenames) == 0 {
fn = CLIOutPutFileName()
} else {
fn = filenames[0]
}
if iterator.IsPaired() {
var reverse string
fn, reverse = BuildPairedFileNames(fn)
opts = append(opts, obiformats.WritePairedReadsTo(reverse))
}
switch CLIOutputFormat() {
case "fastq":
newIter, err = obiformats.WriteFastqToFile(iterator, fn, opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToFile(iterator, fn, opts...)
default:
newIter, err = obiformats.WriteSequencesToFile(iterator, fn, opts...)
}
} else {
switch CLIOutputFormat() {
case "fastq":
newIter, err = obiformats.WriteFastqToStdout(iterator, opts...)
@@ -54,15 +103,6 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
default:
newIter, err = obiformats.WriteSequencesToStdout(iterator, opts...)
}
} else {
switch CLIOutputFormat() {
case "fastq":
newIter, err = obiformats.WriteFastqToFile(iterator, filenames[0], opts...)
case "fasta":
newIter, err = obiformats.WriteFastaToFile(iterator, filenames[0], opts...)
default:
newIter, err = obiformats.WriteSequencesToFile(iterator, filenames[0], opts...)
}
}
if err != nil {

View File

@@ -8,11 +8,15 @@ import (
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
)
func IFilterSequence(iterator obiiter.IBioSequence) obiiter.IBioSequence {
func CLIFilterSequence(iterator obiiter.IBioSequence) obiiter.IBioSequence {
var newIter obiiter.IBioSequence
predicate := CLISequenceSelectionPredicate()
if obiconvert.CLIHasPairedFile() {
predicate = predicate.PairedPredicat(CLIPairedReadMode())
}
if predicate != nil {
if CLISaveDiscardedSequences() {
var discarded obiiter.IBioSequence

View File

@@ -40,6 +40,8 @@ var _AttributePatterns = make(map[string]string, 0)
var _InvertMatch = false
var _SaveRejected = ""
var _PairedMode = "forward"
func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_Taxdump, "taxdump", _Taxdump,
@@ -135,6 +137,11 @@ func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
"Several -a options can be used on the same command line and in this last case, the selected "+
"sequence records will match all constraints."))
options.StringVar(&_PairedMode, "paired-mode", _PairedMode,
options.ArgName("forward|reverse|and|or|andnot|xor"),
options.Description("If paired reads are passed to obibrep, that option determines how the conditions "+
"are applied to both reads."),
)
}
// OptionSet adds to the basic option set every options declared for
@@ -412,3 +419,24 @@ func CLISaveDiscardedSequences() bool {
// CLIDiscardedFileName returns the current value of _SaveRejected.
// NOTE(review): judging by the names, this is the file in which discarded
// sequences are saved — confirm against the option that sets _SaveRejected.
func CLIDiscardedFileName() string {
return _SaveRejected
}
// CLIPairedReadMode converts the value of the "paired-mode" option
// (_PairedMode) into the matching obiseq.SeqPredicateMode.
//
// Recognized values are "forward", "reverse", "and", "or", "andnot" and
// "xor". Any other value is reported through log.Fatalf.
func CLIPairedReadMode() obiseq.SeqPredicateMode {
	modes := map[string]obiseq.SeqPredicateMode{
		"forward": obiseq.ForwardOnly,
		"reverse": obiseq.ReverseOnly,
		"and":     obiseq.And,
		"or":      obiseq.Or,
		"andnot":  obiseq.AndNot,
		"xor":     obiseq.Xor,
	}

	if mode, known := modes[_PairedMode]; known {
		return mode
	}

	log.Fatalf("Paired reads mode must be forward, reverse, and, or, andnot, or xor (%s)", _PairedMode)

	// Terminating return required by the compiler after the fatal log call.
	return obiseq.ForwardOnly
}

View File

@@ -6,8 +6,8 @@ import (
"github.com/DavidGamba/go-getoptions"
)
var _ForwardFiles = make([]string, 0, 10)
var _ReverseFiles = make([]string, 0, 10)
var _ForwardFile = ""
var _ReverseFile = ""
var _Delta = 5
var _MinOverlap = 20
var _GapPenality = float64(2.0)
@@ -15,15 +15,15 @@ var _WithoutStats = false
var _MinIdentity = 0.9
func PairingOptionSet(options *getoptions.GetOpt) {
options.StringSliceVar(&_ForwardFiles, "forward-reads",
1, 1000,
options.StringVar(&_ForwardFile, "forward-reads", "",
options.Alias("F"),
options.Required("You must provide at least one forward file"),
options.ArgName("FILENAME_F"),
options.Required("You must provide at a forward file"),
options.Description("The file names containing the forward reads"))
options.StringSliceVar(&_ReverseFiles, "reverse-reads",
1, 1000,
options.StringVar(&_ReverseFile, "reverse-reads", "",
options.Alias("R"),
options.Required("You must provide at least one reverse file"),
options.ArgName("FILENAME_R"),
options.Required("You must provide a reverse file"),
options.Description("The file names containing the reverse reads"))
options.IntVar(&_Delta, "delta", _Delta,
options.Alias("D"),
@@ -42,42 +42,43 @@ func PairingOptionSet(options *getoptions.GetOpt) {
}
func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options)
obiconvert.OutputOptionSet(options)
obiconvert.InputOptionSet(options)
PairingOptionSet(options)
}
func IBatchPairedSequence() (obiiter.IPairedBioSequenceBatch, error) {
forward, err := obiconvert.ReadBioSequences(_ForwardFiles...)
func CLIPairedSequence() (obiiter.IBioSequence, error) {
forward, err := obiconvert.CLIReadBioSequences(_ForwardFile)
if err != nil {
return obiiter.NilIPairedBioSequenceBatch, err
return obiiter.NilIBioSequence, err
}
reverse, err := obiconvert.ReadBioSequences(_ReverseFiles...)
reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile)
if err != nil {
return obiiter.NilIPairedBioSequenceBatch, err
return obiiter.NilIBioSequence, err
}
paired := forward.PairWith(reverse)
paired := forward.PairTo(reverse)
return paired, nil
}
func Delta() int {
func CLIDelta() int {
return _Delta
}
func MinOverlap() int {
func CLIMinOverlap() int {
return _MinOverlap
}
func MinIdentity() float64 {
func CLIMinIdentity() float64 {
return _MinIdentity
}
func GapPenality() float64 {
func CLIGapPenality() float64 {
return _GapPenality
}
func WithStats() bool {
func CLIWithStats() bool {
return !_WithoutStats
}

View File

@@ -3,12 +3,12 @@ package obipairing
import (
"math"
"os"
"runtime"
log "github.com/sirupsen/logrus"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obialign"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
"github.com/schollz/progressbar/v3"
)
@@ -203,12 +203,16 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
//
// The function returns an iterator over batches of obiseq.Biosequence object.
// each pair of processed sequences produces one sequence in the result iterator.
func IAssemblePESequencesBatch(iterator obiiter.IPairedBioSequenceBatch,
func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
gap float64, delta, minOverlap int,
minIdentity float64,
withStats bool, sizes ...int) obiiter.IBioSequence {
nworkers := runtime.NumCPU() * 3 / 2
if !iterator.IsPaired() {
log.Fatalln("Sequence data must be paired")
}
nworkers := obioptions.CLIMaxCPU() * 3 / 2
buffsize := iterator.BufferSize()
if len(sizes) > 0 {
@@ -236,15 +240,15 @@ func IAssemblePESequencesBatch(iterator obiiter.IPairedBioSequenceBatch,
progressbar.OptionShowIts(),
progressbar.OptionSetDescription("[Sequence Pairing]"))
f := func(iterator obiiter.IPairedBioSequenceBatch, wid int) {
f := func(iterator obiiter.IBioSequence, wid int) {
arena := obialign.MakePEAlignArena(150, 150)
for iterator.Next() {
batch := iterator.Get()
cons := make(obiseq.BioSequenceSlice, len(batch.Forward()))
cons := make(obiseq.BioSequenceSlice, len(batch.Slice()))
processed := 0
for i, A := range batch.Forward() {
B := batch.Reverse()[i]
for i, A := range batch.Slice() {
B := A.PairedWith()
cons[i] = AssemblePESequences(A, B, gap, delta, minOverlap, minIdentity, withStats, true, arena)
if i%59 == 0 {
bar.Add(59)