mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-02-02 22:30:34 +00:00
Merge pull request #68 from metabarcoding/push-rrulynolpprl
Push rrulynolpprl
This commit is contained in:
@@ -98,6 +98,102 @@ else
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obiuniq "${TEST_DIR}/touniq.fasta" \
|
||||
> "${TMPDIR}/touniq_u.fasta"
|
||||
then
|
||||
log "OBIUniq simple: running OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIUniq simple: running failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
obicsv -s --auto ${TEST_DIR}/touniq_u.fasta \
|
||||
| tail -n +2 \
|
||||
| sort \
|
||||
> "${TMPDIR}/touniq_u_ref.csv"
|
||||
|
||||
obicsv -s --auto ${TMPDIR}/touniq_u.fasta \
|
||||
| tail -n +2 \
|
||||
| sort \
|
||||
> "${TMPDIR}/touniq_u.csv"
|
||||
|
||||
((ntest++))
|
||||
if diff "${TMPDIR}/touniq_u_ref.csv" \
|
||||
"${TMPDIR}/touniq_u.csv" > /dev/null
|
||||
then
|
||||
log "OBIUniq simple: result OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIUniq simple: result failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obiuniq -c a "${TEST_DIR}/touniq.fasta" \
|
||||
> "${TMPDIR}/touniq_u_a.fasta"
|
||||
then
|
||||
log "OBIUniq one category: running OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIUniq one category: running failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
obicsv -s --auto ${TEST_DIR}/touniq_u_a.fasta \
|
||||
| tail -n +2 \
|
||||
| sort \
|
||||
> "${TMPDIR}/touniq_u_a_ref.csv"
|
||||
|
||||
obicsv -s --auto ${TMPDIR}/touniq_u_a.fasta \
|
||||
| tail -n +2 \
|
||||
| sort \
|
||||
> "${TMPDIR}/touniq_u_a.csv"
|
||||
|
||||
|
||||
((ntest++))
|
||||
if diff "${TMPDIR}/touniq_u_a_ref.csv" \
|
||||
"${TMPDIR}/touniq_u_a.csv" > /dev/null
|
||||
then
|
||||
log "OBIUniq one category: result OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIUniq one category: result failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obiuniq -c a -c b "${TEST_DIR}/touniq.fasta" \
|
||||
> "${TMPDIR}/touniq_u_a_b.fasta"
|
||||
then
|
||||
log "OBIUniq two categories: running OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIUniq two categories: running failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
obicsv -s --auto ${TEST_DIR}/touniq_u_a_b.fasta \
|
||||
| tail -n +2 \
|
||||
| sort \
|
||||
> "${TMPDIR}/touniq_u_a_b_ref.csv"
|
||||
|
||||
obicsv -s --auto ${TMPDIR}/touniq_u_a_b.fasta \
|
||||
| tail -n +2 \
|
||||
| sort \
|
||||
> "${TMPDIR}/touniq_u_a_b.csv"
|
||||
|
||||
((ntest++))
|
||||
if diff "${TMPDIR}/touniq_u_a_b_ref.csv" \
|
||||
"${TMPDIR}/touniq_u_a_b.csv" > /dev/null
|
||||
then
|
||||
log "OBIUniq two categories: result OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIUniq two categories: result failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
#########################################
|
||||
#
|
||||
|
||||
16
obitests/obitools/obiuniq/touniq.fasta
Normal file
16
obitests/obitools/obiuniq/touniq.fasta
Normal file
@@ -0,0 +1,16 @@
|
||||
>seq1 {"a":2, "b":4,"c":5}
|
||||
aaacccgggttt
|
||||
>seq2 {"a":3, "b":4,"c":5}
|
||||
aaacccgggttt
|
||||
>seq3 {"a":3, "b":5,"c":5}
|
||||
aaacccgggttt
|
||||
>seq4 {"a":3, "b":5,"c":6}
|
||||
aaacccgggttt
|
||||
>seq5 {"a":2, "b":4,"c":5}
|
||||
aaacccgggtttca
|
||||
>seq6 {"a":3, "b":4,"c":5}
|
||||
aaacccgggtttca
|
||||
>seq7 {"a":3, "b":5,"c":5}
|
||||
aaacccgggtttca
|
||||
>seq8 {"a":3, "b":5,"c":6}
|
||||
aaacccgggtttca
|
||||
4
obitests/obitools/obiuniq/touniq_u.fasta
Normal file
4
obitests/obitools/obiuniq/touniq_u.fasta
Normal file
@@ -0,0 +1,4 @@
|
||||
>seq5 {"count":4}
|
||||
aaacccgggtttca
|
||||
>seq1 {"count":4}
|
||||
aaacccgggttt
|
||||
8
obitests/obitools/obiuniq/touniq_u_a.fasta
Normal file
8
obitests/obitools/obiuniq/touniq_u_a.fasta
Normal file
@@ -0,0 +1,8 @@
|
||||
>seq5 {"a":2,"b":4,"c":5,"count":1}
|
||||
aaacccgggtttca
|
||||
>seq6 {"a":3,"count":3}
|
||||
aaacccgggtttca
|
||||
>seq1 {"a":2,"b":4,"c":5,"count":1}
|
||||
aaacccgggttt
|
||||
>seq2 {"a":3,"count":3}
|
||||
aaacccgggttt
|
||||
12
obitests/obitools/obiuniq/touniq_u_a_b.fasta
Normal file
12
obitests/obitools/obiuniq/touniq_u_a_b.fasta
Normal file
@@ -0,0 +1,12 @@
|
||||
>seq5 {"a":2,"b":4,"c":5,"count":1}
|
||||
aaacccgggtttca
|
||||
>seq6 {"a":3,"b":4,"c":5,"count":1}
|
||||
aaacccgggtttca
|
||||
>seq7 {"a":3,"b":5,"count":2}
|
||||
aaacccgggtttca
|
||||
>seq1 {"a":2,"b":4,"c":5,"count":1}
|
||||
aaacccgggttt
|
||||
>seq2 {"a":3,"b":4,"c":5,"count":1}
|
||||
aaacccgggttt
|
||||
>seq3 {"a":3,"b":5,"count":2}
|
||||
aaacccgggttt
|
||||
@@ -7,11 +7,15 @@ import (
|
||||
|
||||
func ISequenceChunk(iterator obiiter.IBioSequence,
|
||||
classifier *obiseq.BioSequenceClassifier,
|
||||
onMemory bool) (obiiter.IBioSequence, error) {
|
||||
onMemory bool,
|
||||
dereplicate bool,
|
||||
na string,
|
||||
statsOn obiseq.StatsOnDescriptions,
|
||||
) (obiiter.IBioSequence, error) {
|
||||
|
||||
if onMemory {
|
||||
return ISequenceChunkOnMemory(iterator, classifier)
|
||||
} else {
|
||||
return ISequenceChunkOnDisk(iterator, classifier)
|
||||
return ISequenceChunkOnDisk(iterator, classifier, dereplicate, na, statsOn)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,7 +74,11 @@ func find(root, ext string) []string {
|
||||
// is removed. The function logs the number of batches created and the processing
|
||||
// status of each batch.
|
||||
func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
||||
classifier *obiseq.BioSequenceClassifier) (obiiter.IBioSequence, error) {
|
||||
classifier *obiseq.BioSequenceClassifier,
|
||||
dereplicate bool,
|
||||
na string,
|
||||
statsOn obiseq.StatsOnDescriptions,
|
||||
) (obiiter.IBioSequence, error) {
|
||||
obiutils.RegisterAPipe()
|
||||
dir, err := tempDir()
|
||||
if err != nil {
|
||||
@@ -113,11 +117,42 @@ func ISequenceChunkOnDisk(iterator obiiter.IBioSequence,
|
||||
panic(err)
|
||||
}
|
||||
|
||||
source, chunk := iseq.Load()
|
||||
if dereplicate {
|
||||
u := make(map[string]*obiseq.BioSequence)
|
||||
var source string
|
||||
var chunk obiseq.BioSequenceSlice
|
||||
|
||||
newIter.Push(obiiter.MakeBioSequenceBatch(source, order, chunk))
|
||||
log.Infof("Start processing of batch %d/%d : %d sequences",
|
||||
order, nbatch, len(chunk))
|
||||
for iseq.Next() {
|
||||
batch := iseq.Get()
|
||||
source = batch.Source()
|
||||
|
||||
for _, seq := range batch.Slice() {
|
||||
sstring := seq.String()
|
||||
prev, ok := u[sstring]
|
||||
if ok {
|
||||
prev.Merge(seq, na, true, statsOn)
|
||||
} else {
|
||||
u[sstring] = seq
|
||||
}
|
||||
}
|
||||
|
||||
chunk = obiseq.MakeBioSequenceSlice(len(u))
|
||||
i := 0
|
||||
|
||||
for _, seq := range u {
|
||||
chunk[i] = seq
|
||||
}
|
||||
|
||||
}
|
||||
newIter.Push(obiiter.MakeBioSequenceBatch(source, order, chunk))
|
||||
|
||||
} else {
|
||||
source, chunk := iseq.Load()
|
||||
|
||||
newIter.Push(obiiter.MakeBioSequenceBatch(source, order, chunk))
|
||||
log.Infof("Start processing of batch %d/%d : %d sequences",
|
||||
order+1, nbatch, len(chunk))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -25,18 +25,32 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
||||
|
||||
log.Infoln("Starting data splitting")
|
||||
|
||||
cat := opts.Categories()
|
||||
na := opts.NAValue()
|
||||
|
||||
var classifier *obiseq.BioSequenceClassifier
|
||||
|
||||
if len(cat) > 0 {
|
||||
cls := make([]*obiseq.BioSequenceClassifier, len(cat)+1)
|
||||
for i, c := range cat {
|
||||
cls[i+1] = obiseq.AnnotationClassifier(c, na)
|
||||
}
|
||||
cls[0] = obiseq.HashClassifier(opts.BatchCount())
|
||||
classifier = obiseq.CompositeClassifier(cls...)
|
||||
} else {
|
||||
classifier = obiseq.HashClassifier(opts.BatchCount())
|
||||
}
|
||||
|
||||
if opts.SortOnDisk() {
|
||||
nworkers = 1
|
||||
iterator, err = ISequenceChunkOnDisk(iterator,
|
||||
obiseq.HashClassifier(opts.BatchCount()))
|
||||
iterator, err = ISequenceChunkOnDisk(iterator, classifier, true, na, opts.StatsOn())
|
||||
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
} else {
|
||||
iterator, err = ISequenceChunkOnMemory(iterator,
|
||||
obiseq.HashClassifier(opts.BatchCount()))
|
||||
iterator, err = ISequenceChunkOnMemory(iterator, classifier)
|
||||
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
@@ -63,63 +77,25 @@ func IUniqueSequence(iterator obiiter.IBioSequence,
|
||||
return neworder
|
||||
}
|
||||
|
||||
var ff func(obiiter.IBioSequence,
|
||||
*obiseq.BioSequenceClassifier,
|
||||
int)
|
||||
|
||||
cat := opts.Categories()
|
||||
na := opts.NAValue()
|
||||
|
||||
ff = func(input obiiter.IBioSequence,
|
||||
classifier *obiseq.BioSequenceClassifier,
|
||||
icat int) {
|
||||
icat--
|
||||
ff := func(input obiiter.IBioSequence,
|
||||
classifier *obiseq.BioSequenceClassifier) {
|
||||
input, err = ISequenceSubChunk(input,
|
||||
classifier,
|
||||
1)
|
||||
|
||||
var next obiiter.IBioSequence
|
||||
if icat >= 0 {
|
||||
next = obiiter.MakeIBioSequence()
|
||||
|
||||
iUnique.Add(1)
|
||||
|
||||
go ff(next,
|
||||
obiseq.AnnotationClassifier(cat[icat], na),
|
||||
icat)
|
||||
}
|
||||
|
||||
o := 0
|
||||
for input.Next() {
|
||||
batch := input.Get()
|
||||
|
||||
if icat < 0 || len(batch.Slice()) == 1 {
|
||||
// No more sub classification of sequence or only a single sequence
|
||||
if !(opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1) {
|
||||
iUnique.Push(batch.Reorder(nextOrder()))
|
||||
}
|
||||
} else {
|
||||
// A new step of classification must be performed
|
||||
next.Push(batch.Reorder(o))
|
||||
o++
|
||||
if !(opts.NoSingleton() && len(batch.Slice()) == 1 && batch.Slice()[0].Count() == 1) {
|
||||
iUnique.Push(batch.Reorder(nextOrder()))
|
||||
}
|
||||
}
|
||||
|
||||
if icat >= 0 {
|
||||
next.Close()
|
||||
}
|
||||
|
||||
iUnique.Done()
|
||||
}
|
||||
|
||||
for i := 0; i < nworkers-1; i++ {
|
||||
go ff(iterator.Split(),
|
||||
obiseq.SequenceClassifier(),
|
||||
len(cat))
|
||||
go ff(iterator.Split(), obiseq.SequenceClassifier())
|
||||
}
|
||||
go ff(iterator,
|
||||
obiseq.SequenceClassifier(),
|
||||
len(cat))
|
||||
go ff(iterator, obiseq.SequenceClassifier())
|
||||
|
||||
iMerged := iUnique.IMergeSequenceBatch(opts.NAValue(),
|
||||
opts.StatsOn(),
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// committed
|
||||
|
||||
var _Commit = "c1b9503"
|
||||
var _Commit = "547135c"
|
||||
var _Version = "Release 4.4.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
||||
@@ -316,3 +316,77 @@ func RotateClassifier(size int) *BioSequenceClassifier {
|
||||
c := BioSequenceClassifier{code, value, reset, clone, "RotateClassifier"}
|
||||
return &c
|
||||
}
|
||||
|
||||
func CompositeClassifier(classifiers ...*BioSequenceClassifier) *BioSequenceClassifier {
|
||||
encode := make(map[string]int, 1000)
|
||||
decode := make([]string, 0, 1000)
|
||||
locke := sync.RWMutex{}
|
||||
maxcode := 0
|
||||
initbufsize := len(classifiers) * 6
|
||||
|
||||
code := func(sequence *BioSequence) int {
|
||||
buf := make([]byte, 0, initbufsize)
|
||||
for _, cl := range classifiers {
|
||||
if cl == nil {
|
||||
continue
|
||||
}
|
||||
rep := cl.Code(sequence)
|
||||
buf = strconv.AppendInt(buf, int64(rep), 10)
|
||||
buf = append(buf, ':')
|
||||
}
|
||||
|
||||
locke.Lock()
|
||||
defer locke.Unlock()
|
||||
|
||||
sval := string(buf)
|
||||
|
||||
k, ok := encode[sval]
|
||||
|
||||
if !ok {
|
||||
k = maxcode
|
||||
maxcode++
|
||||
encode[sval] = k
|
||||
decode = append(decode, sval)
|
||||
}
|
||||
|
||||
return k
|
||||
}
|
||||
|
||||
value := func(k int) string {
|
||||
locke.RLock()
|
||||
defer locke.RUnlock()
|
||||
|
||||
if k >= maxcode {
|
||||
log.Fatalf("value %d not register", k)
|
||||
}
|
||||
return decode[k]
|
||||
}
|
||||
|
||||
reset := func() {
|
||||
locke.Lock()
|
||||
defer locke.Unlock()
|
||||
|
||||
encode = make(map[string]int)
|
||||
decode = decode[:0]
|
||||
maxcode = 0
|
||||
}
|
||||
|
||||
clone := func() *BioSequenceClassifier {
|
||||
clones := make([]*BioSequenceClassifier, 0, len(classifiers))
|
||||
for _, cl := range classifiers {
|
||||
if cl == nil {
|
||||
continue
|
||||
}
|
||||
if cl.Clone != nil {
|
||||
clones = append(clones, cl.Clone())
|
||||
} else {
|
||||
clones = append(clones, cl)
|
||||
}
|
||||
}
|
||||
return CompositeClassifier(clones...)
|
||||
}
|
||||
|
||||
c := BioSequenceClassifier{code, value, reset, clone, "CompositeClassifier"}
|
||||
return &c
|
||||
|
||||
}
|
||||
|
||||
@@ -134,12 +134,12 @@ func CutSequenceWorker(from, to int, breakOnError bool) obiseq.SeqWorker {
|
||||
t = to
|
||||
}
|
||||
|
||||
if from < 0 {
|
||||
from = 0
|
||||
if f < 0 {
|
||||
f = 0
|
||||
}
|
||||
|
||||
if to >= s.Len() {
|
||||
to = s.Len()
|
||||
if t >= s.Len() {
|
||||
t = s.Len()
|
||||
}
|
||||
|
||||
rep, err := s.Subsequence(f, t, false)
|
||||
@@ -147,7 +147,7 @@ func CutSequenceWorker(from, to int, breakOnError bool) obiseq.SeqWorker {
|
||||
if breakOnError {
|
||||
log.Fatalf("Cannot cut sequence %s (%v)", s.Id(), err)
|
||||
} else {
|
||||
err = fmt.Errorf("Cannot cut sequence %s (%v), sequence discarded", s.Id(), err)
|
||||
err = fmt.Errorf("cannot cut sequence %s (%v), sequence discarded", s.Id(), err)
|
||||
}
|
||||
}
|
||||
return obiseq.BioSequenceSlice{rep}, err
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package obiannotate
|
||||
|
||||
import (
|
||||
"math"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -266,6 +267,7 @@ func CLICut() (int, int) {
|
||||
return 0, 0
|
||||
}
|
||||
values := strings.Split(_cut, ":")
|
||||
log.Warnf("values: %v (%d-%d)", values, len(values), len(values[1]))
|
||||
|
||||
if len(values) != 2 {
|
||||
log.Fatalf("Invalid cut value %s. value should be of the form start:end", _cut)
|
||||
@@ -274,12 +276,20 @@ func CLICut() (int, int) {
|
||||
start, err := strconv.Atoi(values[0])
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Invalid cut value %s. value %s should be an integer", _cut, values[0])
|
||||
if len(values[0]) == 0 {
|
||||
start = 1
|
||||
} else {
|
||||
log.Fatalf("Invalid start cut value %s. value %s should be an integer", _cut, values[0])
|
||||
}
|
||||
}
|
||||
end, err := strconv.Atoi(values[1])
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Invalid cut value %s. value %s should be an integer", _cut, values[1])
|
||||
if len(values[1]) == 0 {
|
||||
end = math.MaxInt
|
||||
} else {
|
||||
log.Fatalf("Invalid end cut value %s. value %s should be an integer", _cut, values[1])
|
||||
}
|
||||
}
|
||||
|
||||
return start, end
|
||||
|
||||
Reference in New Issue
Block a user