Patch a bug when writing to stdout, and add a clearer error message when opening data files

This commit is contained in:
Eric Coissac
2024-08-13 09:45:28 +02:00
parent bdb96dda94
commit 31bfc88eb9
43 changed files with 1654 additions and 696 deletions

View File

@@ -188,3 +188,17 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
return iterator, nil
}
// OpenSequenceDataErrorMessage reports a failure to open the sequence data
// and stops the program.
//
// When err is nil the function returns immediately and has no effect.
// Otherwise an error is logged whose wording depends on the number of
// entries in args (no entry: stdin, one entry: that file name, several
// entries: "one of the data files"), and the process exits with status 1.
func OpenSequenceDataErrorMessage(args []string, err error) {
	if err == nil {
		return
	}

	switch nfiles := len(args); {
	case nfiles == 0:
		log.Errorf("Cannot open stdin (%v)", err)
	case nfiles == 1:
		log.Errorf("Cannot open file %s: %v", args[0], err)
	default:
		log.Errorf("Cannot open one of the data files: %v", err)
	}

	os.Exit(1)
}

View File

@@ -0,0 +1,172 @@
package obikmersim
import (
"math"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obifp"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
)
// _Abs returns the absolute value of x.
func _Abs(x int) int {
	if x >= 0 {
		return x
	}

	return -x
}
// MakeCountMatchWorker builds a SeqWorker that annotates each sequence with
// the number of entries of the kmer map k that it matches.
//
// For every sequence the worker queries k, applies
// FilterMinCount(minKmerCount) to the result, and then sets three attributes
// on the sequence itself (no copy is made):
//
//   - obikmer_match_count: the length of the filtered match set
//   - obikmer_kmer_size: the value of k.Kmersize
//   - obikmer_sparse_kmer: true when k.SparseAt >= 0
//
// The worker always returns a slice holding that single sequence and a nil
// error.
func MakeCountMatchWorker[T obifp.FPUint[T]](k *obikmer.KmerMap[T], minKmerCount int) obiseq.SeqWorker {
	countMatches := func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
		hits := k.Query(sequence)
		hits.FilterMinCount(minKmerCount)

		sequence.SetAttribute("obikmer_match_count", hits.Len())
		sequence.SetAttribute("obikmer_kmer_size", k.Kmersize)
		sequence.SetAttribute("obikmer_sparse_kmer", k.SparseAt >= 0)

		return obiseq.BioSequenceSlice{sequence}, nil
	}

	return countMatches
}
// MakeKmerAlignWorker builds a SeqWorker that aligns each incoming sequence
// against every entry of the kmer map k that it matches.
//
// For each sequence the worker queries k, applies
// FilterMinCount(minKmerCount) to the matches, and aligns the sequence with
// each remaining match using obialign.ReadAlign. A copy of the query
// sequence, annotated with the result, is emitted for every match whose
// alignment length is at least k.KmerSize() and whose identity is at least
// minIdentity.
//
// Parameters:
//   - k: the kmer map queried for candidate matches.
//   - minKmerCount: value passed to FilterMinCount on the query result.
//   - gap, scale, delta, fastScoreRel: forwarded unchanged to
//     obialign.ReadAlign.
//   - minIdentity: minimal match/aliLength ratio for a match to be kept.
//   - withStats: when true, the alignment statistics attributes are added to
//     the emitted copies.
//
// The returned worker never returns a non-nil error.
func MakeKmerAlignWorker[T obifp.FPUint[T]](
k *obikmer.KmerMap[T],
minKmerCount int,
gap, scale float64, delta int, fastScoreRel bool,
minIdentity float64, withStats bool) obiseq.SeqWorker {
return func(sequence *obiseq.BioSequence) (obiseq.BioSequenceSlice, error) {
// The arena and the shift map are allocated anew on every call of the
// worker and are shared by all the alignments done for this sequence.
arena := obialign.MakePEAlignArena(150, 150)
shifts := make(map[int]int)
matches := k.Query(sequence)
matches.FilterMinCount(minKmerCount)
// Allocate the result slice from the number of matches, then truncate it
// to length zero so that results can be appended.
slice := obiseq.NewBioSequenceSlice(matches.Len())
*slice = (*slice)[:0]
for _, seq := range matches.Sequences() {
idmatched_id := seq.Id()
score, path, fastcount, over, fastscore, reverse := obialign.ReadAlign(
sequence, seq,
gap, scale, delta,
fastScoreRel,
arena, &shifts,
)
// When ReadAlign reports a reverse alignment, the match id is suffixed
// with "-rev" and the reverse complement of the match is used for the
// consensus step.
if reverse {
idmatched_id = idmatched_id + "-rev"
seq = seq.ReverseComplement(false)
}
cons, match := obialign.BuildQualityConsensus(sequence, seq, path, true, arena)
// left is the first element of the path; right is the second-to-last
// element, but only when the last element is 0 (otherwise it stays 0).
// NOTE(review): presumably these are the unaligned overhangs at both ends
// of the alignment, signed according to which sequence overhangs — confirm
// against obialign. path is indexed without a length check.
left := path[0]
right := 0
if path[len(path)-1] == 0 {
right = path[len(path)-2]
}
lcons := cons.Len()
// Alignment length: consensus length minus both end values.
aliLength := lcons - _Abs(left) - _Abs(right)
// The division is done before the zero check; with aliLength == 0 the
// floating-point result is overwritten with 0 just below.
identity := float64(match) / float64(aliLength)
if aliLength == 0 {
identity = 0
}
// NOTE(review): the copy and the attributes below are produced even for
// matches rejected by the length/identity filter further down.
rep := sequence.Copy()
rep.SetAttribute("obikmer_match_id", idmatched_id)
rep.SetAttribute("obikmer_fast_count", fastcount)
rep.SetAttribute("obikmer_fast_overlap", over)
// The fast score is rounded to three decimals.
rep.SetAttribute("obikmer_fast_score", math.Round(fastscore*1000)/1000)
if reverse {
rep.SetAttribute("obikmer_orientation", "reverse")
} else {
rep.SetAttribute("obikmer_orientation", "forward")
}
// Keep only alignments at least one kmer long and identical enough.
if aliLength >= int(k.KmerSize()) && identity >= minIdentity {
if withStats {
if left < 0 {
rep.SetAttribute("seq_a_single", -left)
rep.SetAttribute("ali_dir", "left")
} else {
rep.SetAttribute("seq_b_single", left)
rep.SetAttribute("ali_dir", "right")
}
// NOTE(review): this second test writes to the same two attribute keys as
// the test on left, so the value stored there can be overwritten — confirm
// this is intended.
if right < 0 {
right = -right
rep.SetAttribute("seq_a_single", right)
} else {
rep.SetAttribute("seq_b_single", right)
}
rep.SetAttribute("obikmer_score", score)
// The normalised score is match/aliLength rounded to three decimals, that
// is the same ratio as identity.
scoreNorm := float64(0)
if aliLength > 0 {
scoreNorm = math.Round(float64(match)/float64(aliLength)*1000) / 1000
} else {
scoreNorm = 0
}
rep.SetAttribute("obikmer_score_norm", scoreNorm)
rep.SetAttribute("obikmer_ali_length", aliLength)
rep.SetAttribute("seq_ab_match", match)
}
*slice = append(*slice, rep)
}
}
return *slice, nil
}
}
// CLILookForSharedKmers annotates the sequences of iterator with the number
// of reference sequences they share kmers with.
//
// The references are loaded through CLIReference. When iterator is
// obiiter.NilIBioSequence, the references themselves are iterated, in
// batches of obioptions.CLIBatchSize(). A kmer map is built from the
// references using CLIKmerSize() and CLISparseMode(), and a worker produced
// by MakeCountMatchWorker is run over the iterator. A "Counting similar
// reads" progress indicator is attached to the result; it is given the
// number of references only when CLISelf() is true. The returned iterator
// is the result of FilterEmpty on that pipeline.
func CLILookForSharedKmers(iterator obiiter.IBioSequence) obiiter.IBioSequence {
	const progressLabel = "Counting similar reads"

	source, references := CLIReference()

	if iterator == obiiter.NilIBioSequence {
		iterator = obiiter.IBatchOver(source, references, obioptions.CLIBatchSize())
	}

	kmerIndex := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode())
	counter := MakeCountMatchWorker(kmerIndex, CLIMinSharedKmers())

	counted := iterator.MakeIWorker(counter, false, obioptions.CLIParallelWorkers())

	if CLISelf() {
		counted = counted.Speed(progressLabel, references.Len())
	} else {
		counted = counted.Speed(progressLabel)
	}

	return counted.FilterEmpty()
}
// CLIAlignSequences aligns the sequences of iterator against the reference
// sequences they share kmers with.
//
// The references are loaded through CLIReference. When iterator is
// obiiter.NilIBioSequence, the references themselves are iterated, in
// batches of obioptions.CLIBatchSize(). An "Aligning reads" progress
// indicator is attached to the input iterator; it is given the number of
// references only when CLISelf() is true. The alignment worker is built by
// MakeKmerAlignWorker from the CLI options, with a fixed minimal identity of
// 0.8 and statistics enabled. The returned iterator is the result of
// FilterEmpty on the worker output.
func CLIAlignSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
	const (
		progressLabel = "Aligning reads"
		minIdentity   = 0.8
		withStats     = true
	)

	source, references := CLIReference()

	if iterator == obiiter.NilIBioSequence {
		iterator = obiiter.IBatchOver(source, references, obioptions.CLIBatchSize())
	}

	if CLISelf() {
		iterator = iterator.Speed(progressLabel, references.Len())
	} else {
		iterator = iterator.Speed(progressLabel)
	}

	kmerIndex := obikmer.NewKmerMap[obifp.Uint64](references, uint(CLIKmerSize()), CLISparseMode())
	aligner := MakeKmerAlignWorker(
		kmerIndex,
		CLIMinSharedKmers(),
		CLIGap(), CLIScale(), CLIDelta(),
		CLIFastRelativeScore(),
		minIdentity, withStats,
	)

	aligned := iterator.MakeIWorker(aligner, false, obioptions.CLIParallelWorkers())

	return aligned.FilterEmpty()
}

View File

@@ -0,0 +1,140 @@
package obikmersim
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions"
)
// Package-level storage for the command line options of the kmer similarity
// tools. The values given here are the defaults used when an option is not
// set on the command line.
var (
	// Options registered by KmerSimCountOptionSet.
	_KmerSize       = 30         // --kmer-size
	_Sparse         = false      // --sparse
	_References     = []string{} // --reference
	_MinSharedKmers = 1          // --min-shared-kmers
	_Self           = false      // --self

	// Options registered by KmerSimMatchOptionSet.
	_Delta         = 5     // --delta
	_PenalityScale = 1.0   // --penality-scale
	_GapPenality   = 2.0   // --gap-penality
	_FastScoreAbs  = false // --fast-absolute
)
// KmerSimCountOptionSet declares the options related to the kmer based
// comparison of sequences.
//
// The function adds the following options to a CLI parser:
//
//   - --kmer-size, -k: size of the kmers
//   - --sparse, -S: switch to the sparse kmer mode
//   - --reference, -r: reference sequences (declared with a minimum and a
//     maximum of one argument)
//   - --min-shared-kmers, -m: minimal number of kmers two sequences must share
//   - --self, -s: compare the references with themselves
//
// # Parameters
//
//   - options: a pointer to the getoptions.GetOpt instance on which the
//     options are registered.
func KmerSimCountOptionSet(options *getoptions.GetOpt) {
	options.IntVar(
		&_KmerSize, "kmer-size", _KmerSize,
		options.Alias("k"),
		options.Description("Kmer size to use."),
	)

	options.BoolVar(
		&_Sparse, "sparse", _Sparse,
		options.Alias("S"),
		options.Description("Set sparse kmer mode."),
	)

	options.StringSliceVar(
		&_References, "reference", 1, 1,
		options.Alias("r"),
		options.Description("Reference sequence."),
	)

	options.IntVar(
		&_MinSharedKmers, "min-shared-kmers", _MinSharedKmers,
		options.Alias("m"),
		options.Description("Minimum number of shared kmers between two sequences."),
	)

	options.BoolVar(
		&_Self, "self", _Self,
		options.Alias("s"),
		options.Description("Compare references with themselves."),
	)
}
// KmerSimMatchOptionSet declares the options tuning the alignment of the
// sequences sharing kmers.
//
// The function adds the following options to a CLI parser:
//
//   - --delta, -d: delta value for the match
//   - --penality-scale, -X: scale factor applied to the mismatch score and
//     to the gap penalty
//   - --gap-penality, -G: gap penalty, as a multiple of the mismatch score
//   - --fast-absolute, -a: use the absolute fast score mode
//
// # Parameters
//
//   - options: a pointer to the getoptions.GetOpt instance on which the
//     options are registered.
func KmerSimMatchOptionSet(options *getoptions.GetOpt) {
	options.IntVar(
		&_Delta, "delta", _Delta,
		options.Alias("d"),
		options.Description("Delta value for the match."),
	)

	options.Float64Var(
		&_PenalityScale, "penality-scale", _PenalityScale,
		options.Alias("X"),
		options.Description("Scale factor applied to the mismatch score and the gap penality (default 1)."),
	)

	options.Float64Var(
		&_GapPenality, "gap-penality", _GapPenality,
		options.Alias("G"),
		options.Description("Gap penality expressed as the multiply factor applied to the mismatch score between two nucleotides with a quality of 40 (default 2)."),
	)

	options.BoolVar(
		&_FastScoreAbs, "fast-absolute", _FastScoreAbs,
		options.Alias("a"),
		options.Description("Use fast absolute score mode."),
	)
}
// CountOptionSet declares the full option set of the kmer counting command:
// the obiconvert options followed by the options of KmerSimCountOptionSet.
func CountOptionSet(options *getoptions.GetOpt) {
	obiconvert.OptionSet(options)

	KmerSimCountOptionSet(options)
}
// MatchOptionSet declares the full option set of the kmer matching command:
// the obiconvert options, followed by the options of KmerSimCountOptionSet
// and then those of KmerSimMatchOptionSet.
func MatchOptionSet(options *getoptions.GetOpt) {
	obiconvert.OptionSet(options)

	KmerSimCountOptionSet(options)
	KmerSimMatchOptionSet(options)
}
// CLIKmerSize returns the kmer size set with the --kmer-size option,
// converted to an unsigned integer.
//
// NOTE(review): the option is stored as an int and converted without any
// check, so a negative value given on the command line would wrap around.
func CLIKmerSize() uint {
	return uint(_KmerSize)
}
// CLISparseMode reports whether the --sparse option has been set.
func CLISparseMode() bool {
	return _Sparse
}
// CLIReference loads the reference sequences designated by the --reference
// option.
//
// The option values are expanded into a list of files with
// obiconvert.ExpandListOfFiles, and the sequences of these files are then
// read and fully loaded. A single reader is used unless
// obiconvert.CLINoInputOrder() is true, in which case
// obioptions.StrictReadWorker() readers are used.
//
// The function returns the source name and the sequences produced by the
// loading step.
//
// NOTE(review): when the file list cannot be expanded, the error is dropped
// and an empty source name with an empty slice is returned; callers cannot
// tell this case from an empty reference set.
func CLIReference() (string, obiseq.BioSequenceSlice) {
	refnames, err := obiconvert.ExpandListOfFiles(false, _References...)
	if err != nil {
		return "", obiseq.BioSequenceSlice{}
	}

	var nreader int
	if obiconvert.CLINoInputOrder() {
		nreader = obioptions.StrictReadWorker()
	} else {
		nreader = 1
	}

	batches := obiformats.ReadSequencesBatchFromFiles(
		refnames,
		obiformats.ReadSequencesFromFile,
		nreader,
	)

	return batches.Load()
}
// CLIMinSharedKmers returns the value of the --min-shared-kmers option.
func CLIMinSharedKmers() int {
	return _MinSharedKmers
}
// CLISelf reports whether the --self option has been set.
func CLISelf() bool {
	return _Self
}
// CLIDelta returns the value of the --delta option.
func CLIDelta() int {
	return _Delta
}
// CLIScale returns the value of the --penality-scale option.
func CLIScale() float64 {
	return _PenalityScale
}
// CLIGapPenality returns the value of the --gap-penality option.
func CLIGapPenality() float64 {
	return _GapPenality
}
func CLIGap() float64 {
return _GapPenality
}
// CLIFastRelativeScore reports whether the fast score is used in relative
// mode, that is whenever the --fast-absolute option has not been set.
func CLIFastRelativeScore() bool {
	return !_FastScoreAbs
}