mirror of
https://github.com/metabarcoding/obitools4.git
synced 2025-06-29 16:20:46 +00:00
several correction in obitag and adds the --save-db option
Former-commit-id: b3a15b1a1b3971388a8749faaba519028f6e976c
This commit is contained in:
@ -200,7 +200,7 @@ func IndexReferenceDB(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
for i := l[0]; i < l[1]; i++ {
|
for i := l[0]; i < l[1]; i++ {
|
||||||
idx := IndexSequence(i, references, &refcounts, &taxa, taxo)
|
idx := IndexSequence(i, references, &refcounts, &taxa, taxo)
|
||||||
iref := references[i].Copy()
|
iref := references[i].Copy()
|
||||||
iref.SetAttribute("obitag_ref_index", idx)
|
iref.SetOBITagRefIndex(idx)
|
||||||
sl = append(sl, iref)
|
sl = append(sl, iref)
|
||||||
bar.Add(1)
|
bar.Add(1)
|
||||||
}
|
}
|
||||||
|
@ -12,7 +12,6 @@ import (
|
|||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitax"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obirefidx"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiutils"
|
||||||
)
|
)
|
||||||
@ -91,7 +90,7 @@ func FindClosests(sequence *obiseq.BioSequence,
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//log.Println("that's all falks", maxe, bestId, bestidxs)
|
log.Debugln("Closest Match", sequence.Id(), maxe, bestId, bestidxs, len(bests))
|
||||||
return bests, maxe, bestId, bestmatch, bestidxs
|
return bests, maxe, bestId, bestmatch, bestidxs
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -102,52 +101,94 @@ func Identify(sequence *obiseq.BioSequence,
|
|||||||
taxo *obitax.Taxonomy,
|
taxo *obitax.Taxonomy,
|
||||||
runExact bool) *obiseq.BioSequence {
|
runExact bool) *obiseq.BioSequence {
|
||||||
bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact)
|
bests, differences, identity, bestmatch, seqidxs := FindClosests(sequence, references, refcounts, runExact)
|
||||||
|
|
||||||
taxon := (*obitax.TaxNode)(nil)
|
taxon := (*obitax.TaxNode)(nil)
|
||||||
|
|
||||||
for i, best := range bests {
|
if identity >= 0.5 && differences > 0 {
|
||||||
idx := best.OBITagRefIndex()
|
newidx := 0
|
||||||
if idx == nil {
|
for i, best := range bests {
|
||||||
// log.Fatalln("Need of indexing")
|
idx := best.OBITagRefIndex()
|
||||||
idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo)
|
if idx == nil {
|
||||||
|
// log.Debugln("Need of indexing")
|
||||||
|
newidx++
|
||||||
|
idx = obirefidx.IndexSequence(seqidxs[i], references, &refcounts, &taxa, taxo)
|
||||||
|
references[seqidxs[i]].SetOBITagRefIndex(idx)
|
||||||
|
log.Debugln(references[seqidxs[i]].Id(), idx)
|
||||||
|
}
|
||||||
|
|
||||||
|
d := differences
|
||||||
|
identification, ok := idx[d]
|
||||||
|
found := false
|
||||||
|
var parts []string
|
||||||
|
|
||||||
|
/*
|
||||||
|
Here is an horrible hack for xprize challence.
|
||||||
|
With Euka01 the part[0] was equal to "" for at
|
||||||
|
least a sequence consensus. Which is not normal.
|
||||||
|
|
||||||
|
TO BE CHECKED AND CORRECTED
|
||||||
|
|
||||||
|
The problem seems related to idx that doesn't have
|
||||||
|
a 0 distance
|
||||||
|
*/
|
||||||
|
for !found && d >= 0 {
|
||||||
|
for !ok && d >= 0 {
|
||||||
|
identification, ok = idx[d]
|
||||||
|
d--
|
||||||
|
}
|
||||||
|
|
||||||
|
parts = strings.Split(identification, "@")
|
||||||
|
|
||||||
|
found = parts[0] != ""
|
||||||
|
if !found {
|
||||||
|
log.Debugln("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d)
|
||||||
|
for !ok && d <= 1000 {
|
||||||
|
identification, ok = idx[d]
|
||||||
|
d++
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
match_taxid, err := strconv.Atoi(parts[0])
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Panicln("Cannot extract taxid from :", identification)
|
||||||
|
}
|
||||||
|
|
||||||
|
match_taxon, err := taxo.Taxon(match_taxid)
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid)
|
||||||
|
}
|
||||||
|
|
||||||
|
if taxon != nil {
|
||||||
|
taxon, _ = taxon.LCA(match_taxon)
|
||||||
|
} else {
|
||||||
|
taxon = match_taxon
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
log.Debugln(sequence.Id(), "Best matches:", len(bests), "New index:", newidx)
|
||||||
|
|
||||||
d := differences
|
sequence.SetTaxid(taxon.Taxid())
|
||||||
identification, ok := idx[d]
|
sequence.SetAttribute("scientific_name", taxon.ScientificName())
|
||||||
for !ok && d >= 0 {
|
sequence.SetAttribute("obitag_rank", taxon.Rank())
|
||||||
identification, ok = idx[d]
|
sequence.SetAttribute("obitag_bestid", identity)
|
||||||
d--
|
sequence.SetAttribute("obitag_difference", differences)
|
||||||
}
|
sequence.SetAttribute("obitag_bestmatch", bestmatch)
|
||||||
|
sequence.SetAttribute("obitag_match_count", len(bests))
|
||||||
parts := strings.Split(identification, "@")
|
|
||||||
match_taxid, err := strconv.Atoi(parts[0])
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
log.Panicln("Cannot extract taxid from :", identification)
|
|
||||||
}
|
|
||||||
|
|
||||||
match_taxon, err := taxo.Taxon(match_taxid)
|
|
||||||
|
|
||||||
if err != nil {
|
|
||||||
log.Panicln("Cannot find taxon corresponding to taxid :", match_taxid)
|
|
||||||
}
|
|
||||||
|
|
||||||
if taxon != nil {
|
|
||||||
taxon, _ = taxon.LCA(match_taxon)
|
|
||||||
} else {
|
|
||||||
taxon = match_taxon
|
|
||||||
}
|
|
||||||
|
|
||||||
|
} else {
|
||||||
|
taxon, _ = taxo.Taxon(1)
|
||||||
|
sequence.SetTaxid(1)
|
||||||
|
sequence.SetAttribute("scientific_name", taxon.ScientificName())
|
||||||
|
sequence.SetAttribute("obitag_rank", taxon.Rank())
|
||||||
|
sequence.SetAttribute("obitag_bestid", identity)
|
||||||
|
sequence.SetAttribute("obitag_difference", differences)
|
||||||
|
sequence.SetAttribute("obitag_bestmatch", bestmatch)
|
||||||
|
sequence.SetAttribute("obitag_match_count", len(bests))
|
||||||
}
|
}
|
||||||
|
|
||||||
sequence.SetTaxid(taxon.Taxid())
|
|
||||||
sequence.SetAttribute("scientific_name", taxon.ScientificName())
|
|
||||||
sequence.SetAttribute("obitag_rank", taxon.Rank())
|
|
||||||
sequence.SetAttribute("obitag_bestid", identity)
|
|
||||||
sequence.SetAttribute("obitag_difference", differences)
|
|
||||||
sequence.SetAttribute("obitag_bestmatch", bestmatch)
|
|
||||||
sequence.SetAttribute("obitag_match_count", len(bests))
|
|
||||||
|
|
||||||
return sequence
|
return sequence
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -161,15 +202,11 @@ func IdentifySeqWorker(references obiseq.BioSequenceSlice,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
|
||||||
|
references obiseq.BioSequenceSlice,
|
||||||
|
taxo *obitax.Taxonomy,
|
||||||
|
) obiiter.IBioSequence {
|
||||||
|
|
||||||
taxo, error := obifind.CLILoadSelectedTaxonomy()
|
|
||||||
|
|
||||||
if error != nil {
|
|
||||||
log.Panicln(error)
|
|
||||||
}
|
|
||||||
|
|
||||||
references := CLIRefDB()
|
|
||||||
refcounts := make(
|
refcounts := make(
|
||||||
[]*obikmer.Table4mer,
|
[]*obikmer.Table4mer,
|
||||||
len(references))
|
len(references))
|
||||||
@ -186,5 +223,5 @@ func AssignTaxonomy(iterator obiiter.IBioSequence) obiiter.IBioSequence {
|
|||||||
|
|
||||||
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
|
worker := IdentifySeqWorker(references, refcounts, taxa, taxo, CLIRunExact())
|
||||||
|
|
||||||
return iterator.Rebatch(17).MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0).Rebatch(1000)
|
return iterator.MakeIWorker(worker, obioptions.CLIParallelWorkers(), 0)
|
||||||
}
|
}
|
||||||
|
@ -4,6 +4,8 @@ import (
|
|||||||
"log"
|
"log"
|
||||||
|
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiformats"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiiter"
|
||||||
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obioptions"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obiseq"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obiconvert"
|
||||||
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
|
"git.metabarcoding.org/lecasofts/go/obitools/pkg/obitools/obifind"
|
||||||
@ -11,6 +13,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
var _RefDB = ""
|
var _RefDB = ""
|
||||||
|
var _SaveRefDB = ""
|
||||||
var _RunExact = false
|
var _RunExact = false
|
||||||
|
|
||||||
func TagOptionSet(options *getoptions.GetOpt) {
|
func TagOptionSet(options *getoptions.GetOpt) {
|
||||||
@ -20,6 +23,10 @@ func TagOptionSet(options *getoptions.GetOpt) {
|
|||||||
options.ArgName("FILENAME"),
|
options.ArgName("FILENAME"),
|
||||||
options.Description("The name of the file containing the reference DB"))
|
options.Description("The name of the file containing the reference DB"))
|
||||||
|
|
||||||
|
options.StringVar(&_SaveRefDB, "save-db", _SaveRefDB,
|
||||||
|
options.ArgName("FILENAME"),
|
||||||
|
options.Description("The name of a file where to save the reference DB with its indices"))
|
||||||
|
|
||||||
// options.BoolVar(&_RunExact, "exact", _RunExact,
|
// options.BoolVar(&_RunExact, "exact", _RunExact,
|
||||||
// options.Alias("E"),
|
// options.Alias("E"),
|
||||||
// options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
|
// options.Description("Unactivate the heuristic limatitating the sequence comparisons"))
|
||||||
@ -48,6 +55,59 @@ func CLIRefDB() obiseq.BioSequenceSlice {
|
|||||||
return refdb.Load()
|
return refdb.Load()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func CLIShouldISaveRefDB() bool {
|
||||||
|
return _SaveRefDB != ""
|
||||||
|
}
|
||||||
|
|
||||||
|
func CLISaveRefetenceDB(db obiseq.BioSequenceSlice) {
|
||||||
|
if CLIShouldISaveRefDB() {
|
||||||
|
idb := obiiter.IBatchOver(db, 1000)
|
||||||
|
|
||||||
|
var newIter obiiter.IBioSequence
|
||||||
|
|
||||||
|
opts := make([]obiformats.WithOption, 0, 10)
|
||||||
|
|
||||||
|
switch obiconvert.CLIOutputFastHeaderFormat() {
|
||||||
|
case "json":
|
||||||
|
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
|
||||||
|
case "obi":
|
||||||
|
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqOBIHeader))
|
||||||
|
default:
|
||||||
|
opts = append(opts, obiformats.OptionsFastSeqHeaderFormat(obiformats.FormatFastSeqJsonHeader))
|
||||||
|
}
|
||||||
|
|
||||||
|
nworkers := obioptions.CLIParallelWorkers() / 4
|
||||||
|
if nworkers < 2 {
|
||||||
|
nworkers = 2
|
||||||
|
}
|
||||||
|
|
||||||
|
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
||||||
|
opts = append(opts, obiformats.OptionsBatchSize(obioptions.CLIBatchSize()))
|
||||||
|
|
||||||
|
opts = append(opts, obiformats.OptionsQualityShift(obiconvert.CLIOutputQualityShift()))
|
||||||
|
|
||||||
|
opts = append(opts, obiformats.OptionsCompressed(obiconvert.CLICompressed()))
|
||||||
|
|
||||||
|
var err error
|
||||||
|
|
||||||
|
switch obiconvert.CLIOutputFormat() {
|
||||||
|
case "fastq":
|
||||||
|
newIter, err = obiformats.WriteFastqToFile(idb, _SaveRefDB, opts...)
|
||||||
|
case "fasta":
|
||||||
|
newIter, err = obiformats.WriteFastaToFile(idb, _SaveRefDB, opts...)
|
||||||
|
default:
|
||||||
|
newIter, err = obiformats.WriteSequencesToFile(idb, _SaveRefDB, opts...)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Write file error: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
newIter.Recycle()
|
||||||
|
obiiter.WaitForLastPipe()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func CLIRunExact() bool {
|
func CLIRunExact() bool {
|
||||||
return _RunExact
|
return _RunExact
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user