Add some code refactoring from the blackboard branch

This commit is contained in:
Eric Coissac
2024-08-02 12:35:46 +02:00
parent bc1aaaf7d9
commit 1b1cd41fd3
38 changed files with 491 additions and 330 deletions

View File

@@ -57,7 +57,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
return samples
}
func annotateOBIClean(dataset obiseq.BioSequenceSlice,
func annotateOBIClean(source string, dataset obiseq.BioSequenceSlice,
sample map[string]*([]*seqPCR),
tag, NAValue string) obiiter.IBioSequence {
batchsize := 1000
@@ -91,7 +91,7 @@ func annotateOBIClean(dataset obiseq.BioSequenceSlice,
return data, nil
}
iter := obiiter.IBatchOver(dataset, batchsize)
iter := obiiter.IBatchOver(source, dataset, batchsize)
riter := iter.MakeISliceWorker(annot, false)
return riter
@@ -288,7 +288,7 @@ func Weight(sequence *obiseq.BioSequence) map[string]int {
func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
db := itertator.Load()
source, db := itertator.Load()
log.Infof("Sequence dataset of %d sequeences loaded\n", len(db))
@@ -365,7 +365,7 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
}
iter := annotateOBIClean(db, samples, SampleAttribute(), "NA")
iter := annotateOBIClean(source, db, samples, SampleAttribute(), "NA")
if OnlyHead() {
iter = iter.FilterOn(IsHead, 1000)

View File

@@ -274,11 +274,11 @@ func ICleanDB(itertator obiiter.IBioSequence) obiiter.IBioSequence {
// obioptions.CLIParallelWorkers(),
// )
references := annotated.Load()
source, references := annotated.Load()
mannwithney := MakeSequenceFamilyGenusWorker(references)
partof := obiiter.IBatchOver(references,
partof := obiiter.IBatchOver(source, references,
obioptions.CLIBatchSize())
// genera_iterator, err := obichunk.ISequenceChunk(

View File

@@ -46,7 +46,12 @@ func BuildConsensus(seqs obiseq.BioSequenceSlice,
if err == nil {
defer fasta.Close()
fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(0, seqs), obiformats.FormatFastSeqJsonHeader, false))
fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(
fmt.Sprintf("%s_consensus", consensus_id),
0,
seqs,
),
obiformats.FormatFastSeqJsonHeader, false).Bytes())
fasta.Close()
}
@@ -333,7 +338,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
dirname := CLIGraphFilesDirectory()
newIter := obiiter.MakeIBioSequence()
db := itertator.Load()
source, db := itertator.Load()
log.Infof("Sequence dataset of %d sequeences loaded\n", len(db))
@@ -394,7 +399,7 @@ func CLIOBIMinion(itertator obiiter.IBioSequence) obiiter.IBioSequence {
CLISampleAttribute(),
CLIKmerSize())
newIter.Push(obiiter.MakeBioSequenceBatch(sample_order, denoised))
newIter.Push(obiiter.MakeBioSequenceBatch(source, sample_order, denoised))
sample_order++
}

View File

@@ -14,7 +14,7 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
)
func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
var err error
list_of_files := orderedset.NewOrderedSet()
for _, fn := range filenames {
@@ -39,7 +39,7 @@ func _ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
if info.IsDir() {
if path != fn {
subdir, e := _ExpandListOfFiles(true, path)
subdir, e := ExpandListOfFiles(true, path)
if e != nil {
return e
}
@@ -113,19 +113,26 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
log.Printf("Reading sequences from stdin in %s\n", CLIInputFormat())
opts = append(opts, obiformats.OptionsSource("stdin"))
var err error
switch CLIInputFormat() {
case "ecopcr":
iterator = obiformats.ReadEcoPCR(os.Stdin, opts...)
iterator, err = obiformats.ReadEcoPCR(os.Stdin, opts...)
case "embl":
iterator = obiformats.ReadEMBL(os.Stdin, opts...)
iterator, err = obiformats.ReadEMBL(os.Stdin, opts...)
case "genbank":
iterator = obiformats.ReadGenbank(os.Stdin, opts...)
iterator, err = obiformats.ReadGenbank(os.Stdin, opts...)
default:
iterator = obiformats.ReadFastSeqFromStdin(opts...)
}
if err != nil {
return obiiter.NilIBioSequence, err
}
} else {
list_of_files, err := _ExpandListOfFiles(false, filenames...)
list_of_files, err := ExpandListOfFiles(false, filenames...)
if err != nil {
return obiiter.NilIBioSequence, err
}

View File

@@ -129,7 +129,7 @@ func CLIJoinSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Fatalf("Cannot read the data file to merge with: %s %v", CLIJoinWith(), err)
}
data := data_iter.Load()
_, data := data_iter.Load()
keys := CLIBy()

View File

@@ -103,7 +103,7 @@ func MapOnLandmarkSequences(library obiseq.BioSequenceSlice, landmark_idx []int,
// which landmark it corresponds.
func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSequence {
library := iterator.Load()
source, library := iterator.Load()
library_size := len(library)
n_landmark := CLINCenter()
@@ -191,6 +191,6 @@ func CLISelectLandmarkSequences(iterator obiiter.IBioSequence) obiiter.IBioSeque
}
}
return obiiter.IBatchOver(library, obioptions.CLIBatchSize())
return obiiter.IBatchOver(source, library, obioptions.CLIBatchSize())
}

View File

@@ -255,6 +255,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
delta, minOverlap, minIdentity, withStats, true, fastAlign, fastModeRel, arena, &shifts)
}
newIter.Push(obiiter.MakeBioSequenceBatch(
batch.Source(),
batch.Order(),
cons,
))

View File

@@ -130,7 +130,7 @@ func MakeIndexingSliceWorker(indexslot, idslot string,
func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Infoln("Family level reference database indexing...")
log.Infoln("Loading database...")
references := iterator.Load()
source, references := iterator.Load()
nref := len(references)
log.Infof("Done. Database contains %d sequences", nref)
@@ -154,7 +154,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Info("done")
partof := obiiter.IBatchOver(references,
partof := obiiter.IBatchOver(source, references,
obioptions.CLIBatchSize()).MakeIWorker(taxonomy.MakeSetSpeciesWorker(),
false,
obioptions.CLIParallelWorkers(),
@@ -243,7 +243,7 @@ func IndexFamilyDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
waiting.Wait()
results := obiiter.IBatchOver(references,
results := obiiter.IBatchOver(source, references,
obioptions.CLIBatchSize()).Speed("Writing db", nref)
return results

View File

@@ -125,7 +125,7 @@ func IndexSequence(seqidx int,
func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
log.Infoln("Loading database...")
references := iterator.Load()
source, references := iterator.Load()
log.Infof("Done. Database contains %d sequences", len(references))
taxo, error := obifind.CLILoadSelectedTaxonomy()
@@ -204,7 +204,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
sl = append(sl, iref)
bar.Add(1)
}
indexed.Push(obiiter.MakeBioSequenceBatch(l[0]/10, sl))
indexed.Push(obiiter.MakeBioSequenceBatch(source, l[0]/10, sl))
}
indexed.Done()

View File

@@ -57,7 +57,9 @@ func CLIRefDB() obiseq.BioSequenceSlice {
log.Panicf("Cannot open the reference library file : %s\n", _RefDB)
}
return refdb.Load()
_, db := refdb.Load()
return db
}
func CLIGeometricMode() bool {
@@ -70,7 +72,7 @@ func CLIShouldISaveRefDB() bool {
func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) {
if CLIShouldISaveRefDB() {
idb := obiiter.IBatchOver(db, 1000)
idb := obiiter.IBatchOver("", db, 1000)
var newIter obiiter.IBioSequence

View File

@@ -57,7 +57,9 @@ func CLIRefDB() obiseq.BioSequenceSlice {
log.Panicf("Cannot open the reference library file : %s\n", _RefDB)
}
return refdb.Load()
_, db := refdb.Load()
return db
}
func CLIGeometricMode() bool {
@@ -70,7 +72,7 @@ func CLIShouldISaveRefDB() bool {
func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) {
if CLIShouldISaveRefDB() {
idb := obiiter.IBatchOver(db, 1000)
idb := obiiter.IBatchOver("", db, 1000)
var newIter obiiter.IBioSequence