mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-06-24 17:51:00 +00:00
Compare commits
16 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| e9210e28a3 | |||
| 13a93fce11 | |||
| 14064c919e | |||
| 1dfd68aa6d | |||
| 930fe5f1ba | |||
| dcdaf9e372 | |||
| af7ae3d60c | |||
| cecf90fa40 | |||
| a186bd1c92 | |||
| 46d60c1a44 | |||
| 6c4a6c697c | |||
| 60b3753673 | |||
| 14e2840a2d | |||
| 42910c7db9 | |||
| 8b4cf677c6 | |||
| 02765f154f |
@@ -37,6 +37,11 @@ func main() {
|
|||||||
|
|
||||||
optionParser(os.Args)
|
optionParser(os.Args)
|
||||||
|
|
||||||
|
if !obipairing.CLIHasPairedFiles() {
|
||||||
|
log.Error("You must provide both a forward file (-F) and a reverse file (-R)")
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
obidefault.SetStrictReadWorker(2)
|
obidefault.SetStrictReadWorker(2)
|
||||||
obidefault.SetStrictWriteWorker(2)
|
obidefault.SetStrictWriteWorker(2)
|
||||||
pairs, err := obipairing.CLIPairedSequence()
|
pairs, err := obipairing.CLIPairedSequence()
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
@@ -8,6 +9,7 @@ import (
|
|||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||||
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obimultiplex"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obipairing"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obipairing"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitagpcr"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitagpcr"
|
||||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||||
@@ -39,6 +41,17 @@ func main() {
|
|||||||
obitagpcr.OptionSet)
|
obitagpcr.OptionSet)
|
||||||
|
|
||||||
optionParser(os.Args)
|
optionParser(os.Args)
|
||||||
|
|
||||||
|
if obimultiplex.CLIAskConfigTemplate() {
|
||||||
|
fmt.Print(obimultiplex.CLIConfigTemplate())
|
||||||
|
os.Exit(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !obipairing.CLIHasPairedFiles() {
|
||||||
|
log.Error("You must provide both a forward file (-F) and a reverse file (-R)")
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
|
||||||
pairs, err := obipairing.CLIPairedSequence()
|
pairs, err := obipairing.CLIPairedSequence()
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -0,0 +1,24 @@
|
|||||||
|
>HELIUM_000100422_612GNAAXX:7:118:3572:14633#0/1_sub[28..126] {"count":10172,"merged_sample":{"26a_F040644":10172},"obitag_bestid":0.9797979797979798,"obitag_bestmatch":"AY227529","obitag_match_count":1,"obitag_rank":"genus","obitag_similarity_method":"lcs","taxid":"taxon:9992 [Marmota]@genus"}
|
||||||
|
ttagccctaaacataaacattcaataaacaagaatgttcgccagagtactactagcaaca
|
||||||
|
gcctgaaactcaaaggacttggcggtgctttacatccct
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:99:9351:13090#0/1_sub[28..127] {"count":260,"merged_sample":{"29a_F260619":260},"obitag_bestid":0.9405940594059405,"obitag_bestmatch":"AF154263","obitag_match_count":9,"obitag_rank":"infraorder","obitag_similarity_method":"lcs","taxid":"taxon:35500 [Pecora]@infraorder"}
|
||||||
|
ttagccctaaacacaaataattacacaaacaaaattgttcaccagagtactagcggcaac
|
||||||
|
agcttaaaactcaaaggacttggcggtgctttataccctt
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:108:10111:9078#0/1_sub[28..127] {"count":7146,"merged_sample":{"13a_F730603":7146},"obitag_bestid":1,"obitag_bestmatch":"AB245427","obitag_match_count":1,"obitag_rank":"species","obitag_similarity_method":"lcs","taxid":"taxon:9860 [Cervus elaphus]@species"}
|
||||||
|
ctagccttaaacacaaatagttatgcaaacaaaactattcgccagagtactaccggcaat
|
||||||
|
agcttaaaactcaaaggacttggcggtgctttataccctt
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:38:14204:12725#0/1_sub[28..126] {"count":87,"merged_sample":{"26a_F040644":87},"obitag_bestid":0.9494949494949495,"obitag_bestmatch":"AY227530","obitag_match_count":2,"obitag_rank":"tribe","obitag_similarity_method":"lcs","taxid":"taxon:337730 [Marmotini]@tribe"}
|
||||||
|
ttagccctaaacataaacattcaataaacaagaatgttcgccagaggactactagcaata
|
||||||
|
gcttaaaactcaaaggacttggcggtgctttatatccct
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:30:9942:4495#0/1_sub[28..126] {"count":95,"merged_sample":{"26a_F040644":11,"29a_F260619":84},"obitag_bestid":0.9595959595959596,"obitag_bestmatch":"AC187326","obitag_match_count":1,"obitag_rank":"subspecies","obitag_similarity_method":"lcs","taxid":"taxon:9615 [Canis lupus familiaris]@subspecies"}
|
||||||
|
ttagccctaaacataagctattccataacaaaataattcgccagagaactactagcaaca
|
||||||
|
gattaaacctcaaaggacttggcagtgctttatacccct
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:51:16702:19393#0/1_sub[28..127] {"count":12004,"merged_sample":{"15a_F730814":7465,"29a_F260619":4539},"obitag_bestid":1,"obitag_bestmatch":"AJ885202","obitag_match_count":1,"obitag_rank":"species","obitag_similarity_method":"lcs","taxid":"taxon:9858 [Capreolus capreolus]@species"}
|
||||||
|
ttagccctaaacacaagtaattaatataacaaaattattcgccagagtactaccggcaat
|
||||||
|
agcttaaaactcaaaggacttggcggtgctttataccctt
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:84:14502:1617#0/1_sub[28..127] {"count":319,"merged_sample":{"29a_F260619":319},"obitag_bestid":1,"obitag_bestmatch":"AJ972683","obitag_match_count":1,"obitag_rank":"species","obitag_similarity_method":"lcs","taxid":"taxon:9858 [Capreolus capreolus]@species"}
|
||||||
|
ttagccctaaacacaagtaattattataacaaaattattcgccagagtactaccggcaat
|
||||||
|
agcttaaaactcaaaggacttggcggtgctttataccctt
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:50:10637:6527#0/1_sub[28..126] {"count":366,"merged_sample":{"13a_F730603":13,"15a_F730814":5,"26a_F040644":347,"29a_F260619":1},"obitag_bestid":1,"obitag_bestmatch":"AB048590","obitag_match_count":1,"obitag_rank":"genus","obitag_similarity_method":"lcs","taxid":"taxon:9611 [Canis]@genus"}
|
||||||
|
ttagccctaaacatagataattttacaacaaaataattcgccagaggactactagcaata
|
||||||
|
gcttaaaactcaaaggacttggcggtgctttatatccct
|
||||||
@@ -0,0 +1,48 @@
|
|||||||
|
taxid,parent,taxonomic_rank,scientific_name
|
||||||
|
taxon:1 [root]@no rank,taxon:1 [root]@no rank,no rank,root
|
||||||
|
taxon:131567 [cellular organisms]@cellular root,taxon:1 [root]@no rank,cellular root,cellular organisms
|
||||||
|
taxon:2759 [Eukaryota]@domain,taxon:131567 [cellular organisms]@cellular root,domain,Eukaryota
|
||||||
|
taxon:33154 [Opisthokonta]@clade,taxon:2759 [Eukaryota]@domain,clade,Opisthokonta
|
||||||
|
taxon:33208 [Metazoa]@kingdom,taxon:33154 [Opisthokonta]@clade,kingdom,Metazoa
|
||||||
|
taxon:6072 [Eumetazoa]@clade,taxon:33208 [Metazoa]@kingdom,clade,Eumetazoa
|
||||||
|
taxon:33213 [Bilateria]@clade,taxon:6072 [Eumetazoa]@clade,clade,Bilateria
|
||||||
|
taxon:33511 [Deuterostomia]@clade,taxon:33213 [Bilateria]@clade,clade,Deuterostomia
|
||||||
|
taxon:7711 [Chordata]@phylum,taxon:33511 [Deuterostomia]@clade,phylum,Chordata
|
||||||
|
taxon:89593 [Craniata]@subphylum,taxon:7711 [Chordata]@phylum,subphylum,Craniata
|
||||||
|
taxon:7742 [Vertebrata]@clade,taxon:89593 [Craniata]@subphylum,clade,Vertebrata
|
||||||
|
taxon:7776 [Gnathostomata]@clade,taxon:7742 [Vertebrata]@clade,clade,Gnathostomata
|
||||||
|
taxon:117570 [Teleostomi]@clade,taxon:7776 [Gnathostomata]@clade,clade,Teleostomi
|
||||||
|
taxon:117571 [Euteleostomi]@clade,taxon:117570 [Teleostomi]@clade,clade,Euteleostomi
|
||||||
|
taxon:8287 [Sarcopterygii]@superclass,taxon:117571 [Euteleostomi]@clade,superclass,Sarcopterygii
|
||||||
|
taxon:1338369 [Dipnotetrapodomorpha]@clade,taxon:8287 [Sarcopterygii]@superclass,clade,Dipnotetrapodomorpha
|
||||||
|
taxon:32523 [Tetrapoda]@clade,taxon:1338369 [Dipnotetrapodomorpha]@clade,clade,Tetrapoda
|
||||||
|
taxon:32524 [Amniota]@clade,taxon:32523 [Tetrapoda]@clade,clade,Amniota
|
||||||
|
taxon:40674 [Mammalia]@class,taxon:32524 [Amniota]@clade,class,Mammalia
|
||||||
|
taxon:32525 [Theria]@clade,taxon:40674 [Mammalia]@class,clade,Theria
|
||||||
|
taxon:9347 [Eutheria]@clade,taxon:32525 [Theria]@clade,clade,Eutheria
|
||||||
|
taxon:1437010 [Boreoeutheria]@clade,taxon:9347 [Eutheria]@clade,clade,Boreoeutheria
|
||||||
|
taxon:314146 [Euarchontoglires]@superorder,taxon:1437010 [Boreoeutheria]@clade,superorder,Euarchontoglires
|
||||||
|
taxon:314145 [Laurasiatheria]@superorder,taxon:1437010 [Boreoeutheria]@clade,superorder,Laurasiatheria
|
||||||
|
taxon:33554 [Carnivora]@order,taxon:314145 [Laurasiatheria]@superorder,order,Carnivora
|
||||||
|
taxon:91561 [Artiodactyla]@order,taxon:314145 [Laurasiatheria]@superorder,order,Artiodactyla
|
||||||
|
taxon:314147 [Glires]@clade,taxon:314146 [Euarchontoglires]@superorder,clade,Glires
|
||||||
|
taxon:9845 [Ruminantia]@suborder,taxon:91561 [Artiodactyla]@order,suborder,Ruminantia
|
||||||
|
taxon:35500 [Pecora]@infraorder,taxon:9845 [Ruminantia]@suborder,infraorder,Pecora
|
||||||
|
taxon:9989 [Rodentia]@order,taxon:314147 [Glires]@clade,order,Rodentia
|
||||||
|
taxon:379584 [Caniformia]@suborder,taxon:33554 [Carnivora]@order,suborder,Caniformia
|
||||||
|
taxon:9608 [Canidae]@family,taxon:379584 [Caniformia]@suborder,family,Canidae
|
||||||
|
taxon:9850 [Cervidae]@family,taxon:35500 [Pecora]@infraorder,family,Cervidae
|
||||||
|
taxon:9881 [Odocoileinae]@subfamily,taxon:9850 [Cervidae]@family,subfamily,Odocoileinae
|
||||||
|
taxon:33553 [Sciuromorpha]@suborder,taxon:9989 [Rodentia]@order,suborder,Sciuromorpha
|
||||||
|
taxon:55153 [Sciuridae]@family,taxon:33553 [Sciuromorpha]@suborder,family,Sciuridae
|
||||||
|
taxon:34878 [Cervinae]@subfamily,taxon:9850 [Cervidae]@family,subfamily,Cervinae
|
||||||
|
taxon:9611 [Canis]@genus,taxon:9608 [Canidae]@family,genus,Canis
|
||||||
|
taxon:9857 [Capreolus]@genus,taxon:9881 [Odocoileinae]@subfamily,genus,Capreolus
|
||||||
|
taxon:9612 [Canis lupus]@species,taxon:9611 [Canis]@genus,species,Canis lupus
|
||||||
|
taxon:337726 [Xerinae]@subfamily,taxon:55153 [Sciuridae]@family,subfamily,Xerinae
|
||||||
|
taxon:9859 [Cervus]@genus,taxon:34878 [Cervinae]@subfamily,genus,Cervus
|
||||||
|
taxon:337730 [Marmotini]@tribe,taxon:337726 [Xerinae]@subfamily,tribe,Marmotini
|
||||||
|
taxon:9992 [Marmota]@genus,taxon:337730 [Marmotini]@tribe,genus,Marmota
|
||||||
|
taxon:9860 [Cervus elaphus]@species,taxon:9859 [Cervus]@genus,species,Cervus elaphus
|
||||||
|
taxon:9615 [Canis lupus familiaris]@subspecies,taxon:9612 [Canis lupus]@species,subspecies,Canis lupus familiaris
|
||||||
|
taxon:9858 [Capreolus capreolus]@species,taxon:9857 [Capreolus]@genus,species,Capreolus capreolus
|
||||||
|
@@ -44,7 +44,7 @@ cleanup() {
|
|||||||
rm -rf "$TMPDIR" # Suppress the temporary directory
|
rm -rf "$TMPDIR" # Suppress the temporary directory
|
||||||
|
|
||||||
if [ $failed -gt 0 ]; then
|
if [ $failed -gt 0 ]; then
|
||||||
log "$TEST_NAME tests failed"
|
log "$TEST_NAME tests failed"
|
||||||
log
|
log
|
||||||
log
|
log
|
||||||
exit 1
|
exit 1
|
||||||
@@ -60,10 +60,10 @@ log() {
|
|||||||
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
||||||
}
|
}
|
||||||
|
|
||||||
log "Testing $TEST_NAME..."
|
log "Testing $TEST_NAME..."
|
||||||
log "Test directory is $TEST_DIR"
|
log "Test directory is $TEST_DIR"
|
||||||
log "obitools directory is $OBITOOLS_DIR"
|
log "obitools directory is $OBITOOLS_DIR"
|
||||||
log "Temporary directory is $TMPDIR"
|
log "Temporary directory is $TMPDIR"
|
||||||
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||||
|
|
||||||
######################################################################
|
######################################################################
|
||||||
@@ -94,12 +94,12 @@ log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
|||||||
|
|
||||||
|
|
||||||
((ntest++))
|
((ntest++))
|
||||||
if $CMD -h > "${TMPDIR}/help.txt" 2>&1
|
if $CMD -h > "${TMPDIR}/help.txt" 2>&1
|
||||||
then
|
then
|
||||||
log "$MCMD: printing help OK"
|
log "$MCMD: printing help OK"
|
||||||
((success++))
|
((success++))
|
||||||
else
|
else
|
||||||
log "$MCMD: printing help failed"
|
log "$MCMD: printing help failed"
|
||||||
((failed++))
|
((failed++))
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -108,15 +108,15 @@ fi
|
|||||||
if obiconvert -Z "${TEST_DIR}/gbpln1088.4Mb.fasta.gz" \
|
if obiconvert -Z "${TEST_DIR}/gbpln1088.4Mb.fasta.gz" \
|
||||||
> "${TMPDIR}/xxx.fasta.gz" && \
|
> "${TMPDIR}/xxx.fasta.gz" && \
|
||||||
zdiff "${TEST_DIR}/gbpln1088.4Mb.fasta.gz" \
|
zdiff "${TEST_DIR}/gbpln1088.4Mb.fasta.gz" \
|
||||||
"${TMPDIR}/xxx.fasta.gz"
|
"${TMPDIR}/xxx.fasta.gz"
|
||||||
then
|
then
|
||||||
log "$MCMD: converting large fasta file to fasta OK"
|
log "$MCMD: converting large fasta file to fasta OK"
|
||||||
((success++))
|
((success++))
|
||||||
else
|
else
|
||||||
log "$MCMD: converting large fasta file to fasta failed"
|
log "$MCMD: converting large fasta file to fasta failed"
|
||||||
((failed++))
|
((failed++))
|
||||||
fi
|
fi
|
||||||
|
|
||||||
((ntest++))
|
((ntest++))
|
||||||
if obiconvert -Z --fastq-output \
|
if obiconvert -Z --fastq-output \
|
||||||
"${TEST_DIR}/gbpln1088.4Mb.fasta.gz" \
|
"${TEST_DIR}/gbpln1088.4Mb.fasta.gz" \
|
||||||
@@ -125,15 +125,139 @@ if obiconvert -Z --fastq-output \
|
|||||||
"${TMPDIR}/xxx.fastq.gz" \
|
"${TMPDIR}/xxx.fastq.gz" \
|
||||||
> "${TMPDIR}/yyy.fasta.gz" && \
|
> "${TMPDIR}/yyy.fasta.gz" && \
|
||||||
zdiff "${TEST_DIR}/gbpln1088.4Mb.fasta.gz" \
|
zdiff "${TEST_DIR}/gbpln1088.4Mb.fasta.gz" \
|
||||||
"${TMPDIR}/yyy.fasta.gz"
|
"${TMPDIR}/yyy.fasta.gz"
|
||||||
then
|
then
|
||||||
log "$MCMD: converting large file between fasta and fastq OK"
|
log "$MCMD: converting large file between fasta and fastq OK"
|
||||||
((success++))
|
((success++))
|
||||||
else
|
else
|
||||||
log "$MCMD: converting large file between fasta and fastq failed"
|
log "$MCMD: converting large file between fasta and fastq failed"
|
||||||
((failed++))
|
((failed++))
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# --raw-taxid tests (no taxonomy loaded)
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Running test
|
||||||
|
((ntest++))
|
||||||
|
if obiconvert --raw-taxid "${TEST_DIR}/out_ecotag.fasta" \
|
||||||
|
> "${TMPDIR}/raw_taxid.fasta" 2>/dev/null
|
||||||
|
then
|
||||||
|
log "$MCMD --raw-taxid: running OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD --raw-taxid: running failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Taxids must be bare numbers — no full-format "taxon:ID [Name]@rank" strings
|
||||||
|
((ntest++))
|
||||||
|
if grep '"taxid"' "${TMPDIR}/raw_taxid.fasta" | grep -qv '"taxid":"[0-9][0-9]*"'
|
||||||
|
then
|
||||||
|
log "$MCMD --raw-taxid: taxid format check failed (full-format taxid found)"
|
||||||
|
((failed++))
|
||||||
|
else
|
||||||
|
log "$MCMD --raw-taxid: taxid format OK (all taxids are bare numbers)"
|
||||||
|
((success++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --raw-taxid is idempotent: piping through a second obiconvert --raw-taxid must
|
||||||
|
# produce bit-for-bit identical output.
|
||||||
|
((ntest++))
|
||||||
|
if obiconvert --raw-taxid "${TMPDIR}/raw_taxid.fasta" \
|
||||||
|
> "${TMPDIR}/raw_taxid2.fasta" 2>/dev/null
|
||||||
|
then
|
||||||
|
log "$MCMD --raw-taxid piped: running OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD --raw-taxid piped: running failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
((ntest++))
|
||||||
|
if diff "${TMPDIR}/raw_taxid.fasta" \
|
||||||
|
"${TMPDIR}/raw_taxid2.fasta" > /dev/null
|
||||||
|
then
|
||||||
|
log "$MCMD --raw-taxid piped: idempotency OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD --raw-taxid piped: idempotency failed (outputs differ)"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# --taxonomy tests (full-format taxid, no --raw-taxid)
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Running test
|
||||||
|
((ntest++))
|
||||||
|
if obiconvert --taxonomy "${TEST_DIR}/taxonomy.csv" \
|
||||||
|
"${TEST_DIR}/out_ecotag.fasta" \
|
||||||
|
> "${TMPDIR}/taxo.fasta" 2>/dev/null
|
||||||
|
then
|
||||||
|
log "$MCMD --taxonomy: running OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD --taxonomy: running failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Taxids must be in full "taxon:ID [Name]@rank" format
|
||||||
|
((ntest++))
|
||||||
|
if grep '"taxid"' "${TMPDIR}/taxo.fasta" | grep -q '"taxid":"taxon:[0-9]'
|
||||||
|
then
|
||||||
|
log "$MCMD --taxonomy: taxid format OK (full-format taxids present)"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD --taxonomy: taxid format check failed (no full-format taxid found)"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
# --raw-taxid --taxonomy tests
|
||||||
|
# ------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Running test
|
||||||
|
((ntest++))
|
||||||
|
if obiconvert --raw-taxid --taxonomy "${TEST_DIR}/taxonomy.csv" \
|
||||||
|
"${TEST_DIR}/out_ecotag.fasta" \
|
||||||
|
> "${TMPDIR}/raw_taxid_taxo.fasta" 2>/dev/null
|
||||||
|
then
|
||||||
|
log "$MCMD --raw-taxid --taxonomy: running OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD --raw-taxid --taxonomy: running failed"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Taxids must be bare numbers even when taxonomy is loaded
|
||||||
|
((ntest++))
|
||||||
|
if grep '"taxid"' "${TMPDIR}/raw_taxid_taxo.fasta" | grep -qv '"taxid":"[0-9][0-9]*"'
|
||||||
|
then
|
||||||
|
log "$MCMD --raw-taxid --taxonomy: taxid format check failed (full-format taxid found)"
|
||||||
|
((failed++))
|
||||||
|
else
|
||||||
|
log "$MCMD --raw-taxid --taxonomy: taxid format OK (all taxids are bare numbers)"
|
||||||
|
((success++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
# --raw-taxid with or without taxonomy must yield identical taxid values
|
||||||
|
((ntest++))
|
||||||
|
if diff <(grep '"taxid"' "${TMPDIR}/raw_taxid.fasta" | grep -o '"taxid":"[^"]*"' | sort) \
|
||||||
|
<(grep '"taxid"' "${TMPDIR}/raw_taxid_taxo.fasta" | grep -o '"taxid":"[^"]*"' | sort) \
|
||||||
|
> /dev/null
|
||||||
|
then
|
||||||
|
log "$MCMD --raw-taxid vs --raw-taxid --taxonomy: taxid values match OK"
|
||||||
|
((success++))
|
||||||
|
else
|
||||||
|
log "$MCMD --raw-taxid vs --raw-taxid --taxonomy: taxid values differ (unexpected)"
|
||||||
|
((failed++))
|
||||||
|
fi
|
||||||
|
|
||||||
|
|
||||||
#########################################
|
#########################################
|
||||||
#
|
#
|
||||||
# At the end of the tests
|
# At the end of the tests
|
||||||
|
|||||||
@@ -0,0 +1,24 @@
|
|||||||
|
>HELIUM_000100422_612GNAAXX:7:118:3572:14633#0/1_sub[28..126] {"count":10172,"merged_sample":{"26a_F040644":10172},"obitag_bestid":0.9797979797979798,"obitag_bestmatch":"AY227529","obitag_match_count":1,"obitag_rank":"genus","obitag_similarity_method":"lcs","taxid":"taxon:9992 [Marmota]@genus"}
|
||||||
|
ttagccctaaacataaacattcaataaacaagaatgttcgccagagtactactagcaaca
|
||||||
|
gcctgaaactcaaaggacttggcggtgctttacatccct
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:99:9351:13090#0/1_sub[28..127] {"count":260,"merged_sample":{"29a_F260619":260},"obitag_bestid":0.9405940594059405,"obitag_bestmatch":"AF154263","obitag_match_count":9,"obitag_rank":"infraorder","obitag_similarity_method":"lcs","taxid":"taxon:35500 [Pecora]@infraorder"}
|
||||||
|
ttagccctaaacacaaataattacacaaacaaaattgttcaccagagtactagcggcaac
|
||||||
|
agcttaaaactcaaaggacttggcggtgctttataccctt
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:108:10111:9078#0/1_sub[28..127] {"count":7146,"merged_sample":{"13a_F730603":7146},"obitag_bestid":1,"obitag_bestmatch":"AB245427","obitag_match_count":1,"obitag_rank":"species","obitag_similarity_method":"lcs","taxid":"taxon:9860 [Cervus elaphus]@species"}
|
||||||
|
ctagccttaaacacaaatagttatgcaaacaaaactattcgccagagtactaccggcaat
|
||||||
|
agcttaaaactcaaaggacttggcggtgctttataccctt
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:38:14204:12725#0/1_sub[28..126] {"count":87,"merged_sample":{"26a_F040644":87},"obitag_bestid":0.9494949494949495,"obitag_bestmatch":"AY227530","obitag_match_count":2,"obitag_rank":"tribe","obitag_similarity_method":"lcs","taxid":"taxon:337730 [Marmotini]@tribe"}
|
||||||
|
ttagccctaaacataaacattcaataaacaagaatgttcgccagaggactactagcaata
|
||||||
|
gcttaaaactcaaaggacttggcggtgctttatatccct
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:30:9942:4495#0/1_sub[28..126] {"count":95,"merged_sample":{"26a_F040644":11,"29a_F260619":84},"obitag_bestid":0.9595959595959596,"obitag_bestmatch":"AC187326","obitag_match_count":1,"obitag_rank":"subspecies","obitag_similarity_method":"lcs","taxid":"taxon:9615 [Canis lupus familiaris]@subspecies"}
|
||||||
|
ttagccctaaacataagctattccataacaaaataattcgccagagaactactagcaaca
|
||||||
|
gattaaacctcaaaggacttggcagtgctttatacccct
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:51:16702:19393#0/1_sub[28..127] {"count":12004,"merged_sample":{"15a_F730814":7465,"29a_F260619":4539},"obitag_bestid":1,"obitag_bestmatch":"AJ885202","obitag_match_count":1,"obitag_rank":"species","obitag_similarity_method":"lcs","taxid":"taxon:9858 [Capreolus capreolus]@species"}
|
||||||
|
ttagccctaaacacaagtaattaatataacaaaattattcgccagagtactaccggcaat
|
||||||
|
agcttaaaactcaaaggacttggcggtgctttataccctt
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:84:14502:1617#0/1_sub[28..127] {"count":319,"merged_sample":{"29a_F260619":319},"obitag_bestid":1,"obitag_bestmatch":"AJ972683","obitag_match_count":1,"obitag_rank":"species","obitag_similarity_method":"lcs","taxid":"taxon:9858 [Capreolus capreolus]@species"}
|
||||||
|
ttagccctaaacacaagtaattattataacaaaattattcgccagagtactaccggcaat
|
||||||
|
agcttaaaactcaaaggacttggcggtgctttataccctt
|
||||||
|
>HELIUM_000100422_612GNAAXX:7:50:10637:6527#0/1_sub[28..126] {"count":366,"merged_sample":{"13a_F730603":13,"15a_F730814":5,"26a_F040644":347,"29a_F260619":1},"obitag_bestid":1,"obitag_bestmatch":"AB048590","obitag_match_count":1,"obitag_rank":"genus","obitag_similarity_method":"lcs","taxid":"taxon:9611 [Canis]@genus"}
|
||||||
|
ttagccctaaacatagataattttacaacaaaataattcgccagaggactactagcaata
|
||||||
|
gcttaaaactcaaaggacttggcggtgctttatatccct
|
||||||
@@ -146,6 +146,65 @@ func __match__key__(text []byte) []int {
|
|||||||
return []int{} // Not a key
|
return []int{} // Not a key
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func __match__array__(text []byte) []int {
|
||||||
|
|
||||||
|
state := 0
|
||||||
|
level := 0
|
||||||
|
start := 0
|
||||||
|
instring := byte(0)
|
||||||
|
|
||||||
|
for i, r := range text {
|
||||||
|
if state == 2 {
|
||||||
|
if r == ';' {
|
||||||
|
return []int{start, i + 1}
|
||||||
|
}
|
||||||
|
if r != ' ' && r != '\t' {
|
||||||
|
return []int{}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if state == 0 {
|
||||||
|
if r == '[' {
|
||||||
|
level++
|
||||||
|
state++
|
||||||
|
start = i
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if r != ' ' && r != '\t' {
|
||||||
|
return []int{}
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// state == 1: inside the array
|
||||||
|
if instring != 0 {
|
||||||
|
if r == instring {
|
||||||
|
instring = 0
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if r == '"' || r == '\'' {
|
||||||
|
instring = r
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if r == '[' || r == '{' {
|
||||||
|
level++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
if r == ']' || r == '}' {
|
||||||
|
level--
|
||||||
|
if level == 0 {
|
||||||
|
state++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return []int{}
|
||||||
|
}
|
||||||
|
|
||||||
func __match__general__(text []byte) []int {
|
func __match__general__(text []byte) []int {
|
||||||
|
|
||||||
for i, r := range text {
|
for i, r := range text {
|
||||||
@@ -242,6 +301,21 @@ func ParseOBIFeatures(text string, annotations obiseq.Annotation) string {
|
|||||||
stop = m[1] + 1
|
stop = m[1] + 1
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
// array value
|
||||||
|
m = __match__array__(part)
|
||||||
|
if len(m) > 0 {
|
||||||
|
bvalue = bytes.TrimSpace(part[m[0]:(m[1] - 1)])
|
||||||
|
j := bytes.ReplaceAll(bvalue, []byte("'"), []byte(`"`))
|
||||||
|
j = __obi_header_map_int_key__.ReplaceAll(j, []byte(`$1"$2":`))
|
||||||
|
arr, err := _parse_json_array_interface(j)
|
||||||
|
if err != nil {
|
||||||
|
value = string(bvalue)
|
||||||
|
} else {
|
||||||
|
value = arr
|
||||||
|
}
|
||||||
|
stop = m[1] + 1
|
||||||
|
} else {
|
||||||
|
|
||||||
// Generic value
|
// Generic value
|
||||||
|
|
||||||
// m = __obi_header_value_general_pattern__.FindIndex(part)
|
// m = __obi_header_value_general_pattern__.FindIndex(part)
|
||||||
@@ -264,6 +338,7 @@ func ParseOBIFeatures(text string, annotations obiseq.Annotation) string {
|
|||||||
// no value
|
// no value
|
||||||
break
|
break
|
||||||
} // End of No value
|
} // End of No value
|
||||||
|
} // End of not array
|
||||||
} // End of not dict
|
} // End of not dict
|
||||||
} // End of not string
|
} // End of not string
|
||||||
} // End of not numeric
|
} // End of not numeric
|
||||||
@@ -327,19 +402,19 @@ func WriteFastSeqOBIHeade(buffer *bytes.Buffer, sequence *obiseq.BioSequence) {
|
|||||||
buffer.WriteString(fmt.Sprintf("%s=", key))
|
buffer.WriteString(fmt.Sprintf("%s=", key))
|
||||||
buffer.Write(tv)
|
buffer.Write(tv)
|
||||||
buffer.WriteString("; ")
|
buffer.WriteString("; ")
|
||||||
case map[string]int,
|
|
||||||
map[string]string,
|
|
||||||
map[string]interface{}:
|
|
||||||
tv, err := obiutils.JsonMarshal(t)
|
|
||||||
if err != nil {
|
|
||||||
log.Fatalf("Cannot convert %v value", value)
|
|
||||||
}
|
|
||||||
tv = bytes.ReplaceAll(tv, []byte(`"`), []byte("'"))
|
|
||||||
buffer.WriteString(fmt.Sprintf("%s=", key))
|
|
||||||
buffer.Write(tv)
|
|
||||||
buffer.WriteString("; ")
|
|
||||||
default:
|
default:
|
||||||
buffer.WriteString(fmt.Sprintf("%s=%v; ", key, value))
|
if obiutils.IsAMap(value) || obiutils.IsASlice(value) || obiutils.IsAnArray(value) {
|
||||||
|
tv, err := obiutils.JsonMarshal(t)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("Cannot convert %v value", value)
|
||||||
|
}
|
||||||
|
tv = bytes.ReplaceAll(tv, []byte(`"`), []byte("'"))
|
||||||
|
buffer.WriteString(fmt.Sprintf("%s=", key))
|
||||||
|
buffer.Write(tv)
|
||||||
|
buffer.WriteString("; ")
|
||||||
|
} else {
|
||||||
|
buffer.WriteString(fmt.Sprintf("%s=%v; ", key, value))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -90,6 +90,9 @@ func FormatFastaBatch(batch obiiter.BioSequenceBatch, formater FormatHeader, ski
|
|||||||
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
|
log.Debugf("FormatFastaBatch: #%d : %d seqs", batch.Order(), batch.Len())
|
||||||
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
|
if len(seq.Id()) == 0 {
|
||||||
|
log.Fatalf("Sequence identifier is empty")
|
||||||
|
}
|
||||||
if seq.Len() > 0 {
|
if seq.Len() > 0 {
|
||||||
// Write header directly into bs — no intermediate string
|
// Write header directly into bs — no intermediate string
|
||||||
bs.WriteByte('>')
|
bs.WriteByte('>')
|
||||||
|
|||||||
@@ -64,6 +64,9 @@ func FormatFastqBatch(batch obiiter.BioSequenceBatch,
|
|||||||
first := true
|
first := true
|
||||||
|
|
||||||
for _, seq := range batch.Slice() {
|
for _, seq := range batch.Slice() {
|
||||||
|
if len(seq.Id()) == 0 {
|
||||||
|
log.Fatalf("Sequence identifier is empty")
|
||||||
|
}
|
||||||
if seq.Len() > 0 {
|
if seq.Len() > 0 {
|
||||||
_formatFastq(&bs, seq, formater)
|
_formatFastq(&bs, seq, formater)
|
||||||
|
|
||||||
|
|||||||
@@ -631,9 +631,9 @@ func ReadCSVNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) {
|
|||||||
return nil, fmt.Errorf("row %d has %d columns, expected %d", len(data), len(fields), len(header))
|
return nil, fmt.Errorf("row %d has %d columns, expected %d", len(data), len(fields), len(header))
|
||||||
}
|
}
|
||||||
|
|
||||||
forward_primer := fields[forward_primerColIndex]
|
forward_primer := strings.TrimSpace(fields[forward_primerColIndex])
|
||||||
reverse_primer := fields[reverse_primerColIndex]
|
reverse_primer := strings.TrimSpace(fields[reverse_primerColIndex])
|
||||||
tags := _parseMainNGSFilterTags(fields[sample_tagColIndex])
|
tags := _parseMainNGSFilterTags(strings.TrimSpace(fields[sample_tagColIndex]))
|
||||||
|
|
||||||
marker, _ := ngsfilter.GetMarker(forward_primer, reverse_primer)
|
marker, _ := ngsfilter.GetMarker(forward_primer, reverse_primer)
|
||||||
pcr, ok := marker.GetPCR(tags.Forward, tags.Reverse)
|
pcr, ok := marker.GetPCR(tags.Forward, tags.Reverse)
|
||||||
@@ -644,8 +644,8 @@ func ReadCSVNGSFilter(reader io.Reader) (*obingslibrary.NGSLibrary, error) {
|
|||||||
i, tags.Forward, tags.Reverse, forward_primer, reverse_primer)
|
i, tags.Forward, tags.Reverse, forward_primer, reverse_primer)
|
||||||
}
|
}
|
||||||
|
|
||||||
pcr.Experiment = fields[experimentColIndex]
|
pcr.Experiment = strings.TrimSpace(fields[experimentColIndex])
|
||||||
pcr.Sample = fields[sampleColIndex]
|
pcr.Sample = strings.TrimSpace(fields[sampleColIndex])
|
||||||
|
|
||||||
if extraColumns != nil {
|
if extraColumns != nil {
|
||||||
pcr.Annotations = make(obiseq.Annotation)
|
pcr.Annotations = make(obiseq.Annotation)
|
||||||
|
|||||||
+90
-55
@@ -4,22 +4,21 @@ import "math"
|
|||||||
|
|
||||||
// KmerEntropy computes the entropy of a single encoded k-mer.
|
// KmerEntropy computes the entropy of a single encoded k-mer.
|
||||||
//
|
//
|
||||||
// The algorithm mirrors the lowmask entropy calculation: it decodes the k-mer
|
// The algorithm mirrors the Rust obiskbuilder entropy: it decodes the k-mer
|
||||||
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
|
// to a DNA sequence, extracts all sub-words of each size from 1 to levelMax,
|
||||||
// normalizes them by circular canonical form, counts their frequencies, and
|
// normalizes them by circular canonical form, counts their frequencies, and
|
||||||
// computes Shannon entropy normalized by the maximum possible entropy.
|
// computes Shannon entropy corrected for class sizes, normalized by the
|
||||||
|
// maximum possible entropy over 4^ws raw bins.
|
||||||
// The returned value is the minimum entropy across all word sizes.
|
// The returned value is the minimum entropy across all word sizes.
|
||||||
//
|
//
|
||||||
|
// Correction for small sequences: the raw entropy H = log(N) - Σ f·log(f)/N
|
||||||
|
// under-estimates the true complexity when many raw words collapse to the same
|
||||||
|
// canonical form. Adding Σ f·log(class_size)/N recovers the entropy of the
|
||||||
|
// underlying uncollapsed distribution (assuming uniform mixing within each
|
||||||
|
// equivalence class).
|
||||||
|
//
|
||||||
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
|
// A value close to 0 indicates very low complexity (e.g. "AAAA..."),
|
||||||
// while a value close to 1 indicates high complexity.
|
// while a value close to 1 indicates high complexity.
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - kmer: the encoded k-mer (2 bits per base)
|
|
||||||
// - k: the k-mer size
|
|
||||||
// - levelMax: maximum sub-word size for entropy (typically 6)
|
|
||||||
//
|
|
||||||
// Returns:
|
|
||||||
// - minimum normalized entropy across all word sizes 1..levelMax
|
|
||||||
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
||||||
if k < 1 || levelMax < 1 {
|
if k < 1 || levelMax < 1 {
|
||||||
return 1.0
|
return 1.0
|
||||||
@@ -35,7 +34,7 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
|||||||
var seqBuf [32]byte
|
var seqBuf [32]byte
|
||||||
seq := DecodeKmer(kmer, k, seqBuf[:])
|
seq := DecodeKmer(kmer, k, seqBuf[:])
|
||||||
|
|
||||||
// Pre-compute nLogN lookup (same as lowmask)
|
// Pre-compute nLogN lookup
|
||||||
nLogN := make([]float64, k+1)
|
nLogN := make([]float64, k+1)
|
||||||
for i := 1; i <= k; i++ {
|
for i := 1; i <= k; i++ {
|
||||||
nLogN[i] = float64(i) * math.Log(float64(i))
|
nLogN[i] = float64(i) * math.Log(float64(i))
|
||||||
@@ -51,6 +50,23 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Build ln(class_size) tables: for each canonical form, how many raw
|
||||||
|
// words map to it under circular normalization.
|
||||||
|
classLogSizeTables := make([][]float64, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
tableSize := 1 << (ws * 2)
|
||||||
|
classSize := make([]int, tableSize)
|
||||||
|
for code := 0; code < tableSize; code++ {
|
||||||
|
classSize[normTables[ws][code]]++
|
||||||
|
}
|
||||||
|
classLogSizeTables[ws] = make([]float64, tableSize)
|
||||||
|
for j := 0; j < tableSize; j++ {
|
||||||
|
if classSize[j] > 0 {
|
||||||
|
classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
minEntropy := math.MaxFloat64
|
minEntropy := math.MaxFloat64
|
||||||
|
|
||||||
for ws := 1; ws <= levelMax; ws++ {
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
@@ -75,23 +91,13 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
|||||||
table[normWord]++
|
table[normWord]++
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute Shannon entropy
|
// Compute emax over 4^ws raw bins (uncollapsed distribution).
|
||||||
floatNwords := float64(nwords)
|
floatNwords := float64(nwords)
|
||||||
logNwords := math.Log(floatNwords)
|
logNwords := math.Log(floatNwords)
|
||||||
|
na := tableSize // 4^ws
|
||||||
var sumNLogN float64
|
|
||||||
for j := 0; j < tableSize; j++ {
|
|
||||||
n := table[j]
|
|
||||||
if n > 0 {
|
|
||||||
sumNLogN += nLogN[n]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Compute emax (maximum possible entropy for this word size)
|
|
||||||
na := CanonicalCircularKmerCount(ws)
|
|
||||||
var emax float64
|
var emax float64
|
||||||
if nwords < na {
|
if nwords < na {
|
||||||
emax = math.Log(float64(nwords))
|
emax = logNwords
|
||||||
} else {
|
} else {
|
||||||
cov := nwords / na
|
cov := nwords / na
|
||||||
remains := nwords - (na * cov)
|
remains := nwords - (na * cov)
|
||||||
@@ -105,7 +111,19 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
// Accumulate Σ f·log(f) and Σ f·log(class_size) over canonical forms.
|
||||||
|
classLogSize := classLogSizeTables[ws]
|
||||||
|
var sumNLogN, sumClassLogN float64
|
||||||
|
for j := 0; j < tableSize; j++ {
|
||||||
|
n := table[j]
|
||||||
|
if n > 0 {
|
||||||
|
sumNLogN += nLogN[n]
|
||||||
|
sumClassLogN += float64(n) * classLogSize[j]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
|
||||||
|
entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
|
||||||
if entropy < 0 {
|
if entropy < 0 {
|
||||||
entropy = 0
|
entropy = 0
|
||||||
}
|
}
|
||||||
@@ -129,24 +147,20 @@ func KmerEntropy(kmer uint64, k int, levelMax int) float64 {
|
|||||||
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
|
// IMPORTANT: a KmerEntropyFilter is NOT safe for concurrent use.
|
||||||
// Each goroutine must create its own instance via NewKmerEntropyFilter.
|
// Each goroutine must create its own instance via NewKmerEntropyFilter.
|
||||||
type KmerEntropyFilter struct {
|
type KmerEntropyFilter struct {
|
||||||
k int
|
k int
|
||||||
levelMax int
|
levelMax int
|
||||||
threshold float64
|
threshold float64
|
||||||
nLogN []float64
|
nLogN []float64
|
||||||
normTables [][]int
|
normTables [][]int
|
||||||
emaxValues []float64
|
classLogSizeTables [][]float64
|
||||||
logNwords []float64
|
emaxValues []float64
|
||||||
|
logNwords []float64
|
||||||
// Pre-allocated frequency tables reused across Entropy() calls.
|
// Pre-allocated frequency tables reused across Entropy() calls.
|
||||||
// One per word size (index 0 unused). Reset to zero before each use.
|
// One per word size (index 0 unused). Reset to zero before each use.
|
||||||
freqTables [][]int
|
freqTables [][]int
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
|
// NewKmerEntropyFilter creates an entropy filter with pre-computed tables.
|
||||||
//
|
|
||||||
// Parameters:
|
|
||||||
// - k: the k-mer size
|
|
||||||
// - levelMax: maximum sub-word size for entropy (typically 6)
|
|
||||||
// - threshold: entropy threshold (k-mers with entropy <= threshold are rejected)
|
|
||||||
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
|
func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter {
|
||||||
if levelMax >= k {
|
if levelMax >= k {
|
||||||
levelMax = k - 1
|
levelMax = k - 1
|
||||||
@@ -169,20 +183,38 @@ func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ln(class_size) for each canonical form under circular normalization.
|
||||||
|
classLogSizeTables := make([][]float64, levelMax+1)
|
||||||
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
|
tableSize := 1 << (ws * 2)
|
||||||
|
classSize := make([]int, tableSize)
|
||||||
|
for code := 0; code < tableSize; code++ {
|
||||||
|
classSize[normTables[ws][code]]++
|
||||||
|
}
|
||||||
|
classLogSizeTables[ws] = make([]float64, tableSize)
|
||||||
|
for j := 0; j < tableSize; j++ {
|
||||||
|
if classSize[j] > 0 {
|
||||||
|
classLogSizeTables[ws][j] = math.Log(float64(classSize[j]))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pre-compute emax and logNwords per word size.
|
||||||
|
// emax uses 4^ws raw bins to match the corrected entropy.
|
||||||
emaxValues := make([]float64, levelMax+1)
|
emaxValues := make([]float64, levelMax+1)
|
||||||
logNwords := make([]float64, levelMax+1)
|
logNwords := make([]float64, levelMax+1)
|
||||||
for ws := 1; ws <= levelMax; ws++ {
|
for ws := 1; ws <= levelMax; ws++ {
|
||||||
nw := k - ws + 1
|
nw := k - ws + 1
|
||||||
na := CanonicalCircularKmerCount(ws)
|
na := 1 << (ws * 2) // 4^ws raw bins
|
||||||
|
floatNw := float64(nw)
|
||||||
|
logNwords[ws] = math.Log(floatNw)
|
||||||
if nw < na {
|
if nw < na {
|
||||||
logNwords[ws] = math.Log(float64(nw))
|
emaxValues[ws] = logNwords[ws]
|
||||||
emaxValues[ws] = math.Log(float64(nw))
|
|
||||||
} else {
|
} else {
|
||||||
cov := nw / na
|
cov := nw / na
|
||||||
remains := nw - (na * cov)
|
remains := nw - (na * cov)
|
||||||
f1 := float64(cov) / float64(nw)
|
f1 := float64(cov) / floatNw
|
||||||
f2 := float64(cov+1) / float64(nw)
|
f2 := float64(cov+1) / floatNw
|
||||||
logNwords[ws] = math.Log(float64(nw))
|
|
||||||
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
emaxValues[ws] = -(float64(na-remains)*f1*math.Log(f1) +
|
||||||
float64(remains)*f2*math.Log(f2))
|
float64(remains)*f2*math.Log(f2))
|
||||||
}
|
}
|
||||||
@@ -195,14 +227,15 @@ func NewKmerEntropyFilter(k, levelMax int, threshold float64) *KmerEntropyFilter
|
|||||||
}
|
}
|
||||||
|
|
||||||
return &KmerEntropyFilter{
|
return &KmerEntropyFilter{
|
||||||
k: k,
|
k: k,
|
||||||
levelMax: levelMax,
|
levelMax: levelMax,
|
||||||
threshold: threshold,
|
threshold: threshold,
|
||||||
nLogN: nLogN,
|
nLogN: nLogN,
|
||||||
normTables: normTables,
|
normTables: normTables,
|
||||||
emaxValues: emaxValues,
|
classLogSizeTables: classLogSizeTables,
|
||||||
logNwords: logNwords,
|
emaxValues: emaxValues,
|
||||||
freqTables: freqTables,
|
logNwords: logNwords,
|
||||||
|
freqTables: freqTables,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -236,7 +269,7 @@ func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
|
|||||||
// Count circular-canonical sub-word frequencies
|
// Count circular-canonical sub-word frequencies
|
||||||
tableSize := 1 << (ws * 2)
|
tableSize := 1 << (ws * 2)
|
||||||
table := ef.freqTables[ws]
|
table := ef.freqTables[ws]
|
||||||
clear(table) // reset to zero
|
clear(table)
|
||||||
mask := (1 << (ws * 2)) - 1
|
mask := (1 << (ws * 2)) - 1
|
||||||
normTable := ef.normTables[ws]
|
normTable := ef.normTables[ws]
|
||||||
|
|
||||||
@@ -251,19 +284,21 @@ func (ef *KmerEntropyFilter) Entropy(kmer uint64) float64 {
|
|||||||
table[normWord]++
|
table[normWord]++
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute Shannon entropy
|
|
||||||
floatNwords := float64(nwords)
|
floatNwords := float64(nwords)
|
||||||
logNwords := ef.logNwords[ws]
|
logNwords := ef.logNwords[ws]
|
||||||
|
classLogSize := ef.classLogSizeTables[ws]
|
||||||
|
|
||||||
var sumNLogN float64
|
var sumNLogN, sumClassLogN float64
|
||||||
for j := 0; j < tableSize; j++ {
|
for j := 0; j < tableSize; j++ {
|
||||||
n := table[j]
|
n := table[j]
|
||||||
if n > 0 {
|
if n > 0 {
|
||||||
sumNLogN += ef.nLogN[n]
|
sumNLogN += ef.nLogN[n]
|
||||||
|
sumClassLogN += float64(n) * classLogSize[j]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
entropy := (logNwords - sumNLogN/floatNwords) / emax
|
// Corrected entropy: H_raw ≈ log(N) + (Σf·log(s) - Σf·log(f)) / N
|
||||||
|
entropy := (logNwords + sumClassLogN/floatNwords - sumNLogN/floatNwords) / emax
|
||||||
if entropy < 0 {
|
if entropy < 0 {
|
||||||
entropy = 0
|
entropy = 0
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ package obioptions
|
|||||||
// Version is automatically updated by the Makefile from version.txt
|
// Version is automatically updated by the Makefile from version.txt
|
||||||
// The patch number (third digit) is incremented on each push to the repository
|
// The patch number (third digit) is incremented on each push to the repository
|
||||||
|
|
||||||
var _Version = "Release 4.4.41"
|
var _Version = "Release 4.4.44"
|
||||||
|
|
||||||
// Version returns the version of the obitools package.
|
// Version returns the version of the obitools package.
|
||||||
//
|
//
|
||||||
|
|||||||
@@ -364,6 +364,24 @@ func (s *BioSequence) GetIntSlice(key string) ([]int, bool) {
|
|||||||
return val, ok
|
return val, ok
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (s *BioSequence) GetMapOfIntSlice(key string) (map[string][]int, bool) {
|
||||||
|
v, ok := s.GetAttribute(key)
|
||||||
|
if !ok {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
val, err := obiutils.InterfaceToMapOfIntSlice(v)
|
||||||
|
return val, err == nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *BioSequence) GetMapOfStringSlice(key string) (map[string][]string, bool) {
|
||||||
|
v, ok := s.GetAttribute(key)
|
||||||
|
if !ok {
|
||||||
|
return nil, false
|
||||||
|
}
|
||||||
|
val, err := obiutils.InterfaceToMapOfStringSlice(v)
|
||||||
|
return val, err == nil
|
||||||
|
}
|
||||||
|
|
||||||
// Count returns the value of the "count" attribute of the BioSequence.
|
// Count returns the value of the "count" attribute of the BioSequence.
|
||||||
//
|
//
|
||||||
// The count of a sequence is the number of times it has been observed in the dataset.
|
// The count of a sequence is the number of times it has been observed in the dataset.
|
||||||
|
|||||||
@@ -103,7 +103,7 @@ func TestNewBioSequence(t *testing.T) {
|
|||||||
// Return type: None.
|
// Return type: None.
|
||||||
func TestNewBioSequenceWithQualities(t *testing.T) {
|
func TestNewBioSequenceWithQualities(t *testing.T) {
|
||||||
id := "123"
|
id := "123"
|
||||||
sequence := []byte("ATGC")
|
sequence := []byte("atgc")
|
||||||
definition := "DNA sequence"
|
definition := "DNA sequence"
|
||||||
qualities := []byte("1234")
|
qualities := []byte("1234")
|
||||||
|
|
||||||
|
|||||||
@@ -141,6 +141,33 @@ var OBILang = gval.NewLanguage(
|
|||||||
gval.Function("max", func(args ...interface{}) (interface{}, error) {
|
gval.Function("max", func(args ...interface{}) (interface{}, error) {
|
||||||
return obiutils.Max(args[0])
|
return obiutils.Max(args[0])
|
||||||
}),
|
}),
|
||||||
|
gval.Function("which_max", func(args ...interface{}) (interface{}, error) {
|
||||||
|
result, err := obiutils.WhichMax(args[0])
|
||||||
|
if idx, ok := result.(int); ok {
|
||||||
|
return float64(idx), nil
|
||||||
|
}
|
||||||
|
return result, err
|
||||||
|
}),
|
||||||
|
gval.Function("which_min", func(args ...interface{}) (interface{}, error) {
|
||||||
|
result, err := obiutils.WhichMin(args[0])
|
||||||
|
if idx, ok := result.(int); ok {
|
||||||
|
return float64(idx), nil
|
||||||
|
}
|
||||||
|
return result, err
|
||||||
|
}),
|
||||||
|
|
||||||
|
gval.Function("filtermin", func(args ...interface{}) (interface{}, error) {
|
||||||
|
return obiutils.FilterMin(args[0], args[1])
|
||||||
|
}),
|
||||||
|
|
||||||
|
gval.Function("filtermax", func(args ...interface{}) (interface{}, error) {
|
||||||
|
return obiutils.FilterMax(args[0], args[1])
|
||||||
|
}),
|
||||||
|
|
||||||
|
gval.Function("saturatingsub", func(args ...interface{}) (interface{}, error) {
|
||||||
|
return obiutils.SaturatingSub(args[0], args[1])
|
||||||
|
}),
|
||||||
|
|
||||||
gval.Function("contains", func(args ...interface{}) (interface{}, error) {
|
gval.Function("contains", func(args ...interface{}) (interface{}, error) {
|
||||||
if obiutils.IsAMap(args[0]) {
|
if obiutils.IsAMap(args[0]) {
|
||||||
val := reflect.ValueOf(args[0]).MapIndex(reflect.ValueOf(args[1]))
|
val := reflect.ValueOf(args[0]).MapIndex(reflect.ValueOf(args[1]))
|
||||||
|
|||||||
@@ -70,6 +70,12 @@ func (s *BioSequence) SetTaxid(taxid string, rank ...string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} else if obidefault.UseRawTaxids() {
|
||||||
|
// Without a loaded taxonomy, extract the bare ID from full-format strings
|
||||||
|
// like "code:12345 [Name]@rank" so that --raw-taxid is honoured everywhere.
|
||||||
|
if _, rawID, _, _, parseErr := obitax.ParseTaxonString(taxid); parseErr == nil {
|
||||||
|
taxid = rawID
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -177,7 +183,7 @@ func (sequence *BioSequence) SetPath(taxonomy *obitax.Taxonomy) []string {
|
|||||||
lpath := path.Len() - 1
|
lpath := path.Len() - 1
|
||||||
|
|
||||||
for i := lpath; i >= 0; i-- {
|
for i := lpath; i >= 0; i-- {
|
||||||
spath[lpath-i] = path.Get(i).String(taxonomy.Code())
|
spath[lpath-i] = path.Get(i).FullString(taxonomy.Code())
|
||||||
}
|
}
|
||||||
|
|
||||||
sequence.SetAttribute("taxonomic_path", spath)
|
sequence.SetAttribute("taxonomic_path", spath)
|
||||||
|
|||||||
+19
-13
@@ -29,6 +29,24 @@ type TaxNode struct {
|
|||||||
alternatenames *map[*string]*string
|
alternatenames *map[*string]*string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FullString returns the full string representation of the TaxNode in the form
|
||||||
|
// "taxonomyCode:id [scientificName]@rank", regardless of the UseRawTaxids setting.
|
||||||
|
// This is used internally when a parseable format is required (e.g. taxonomic_path).
|
||||||
|
func (node *TaxNode) FullString(taxonomyCode string) string {
|
||||||
|
if node.HasScientificName() {
|
||||||
|
return fmt.Sprintf("%s:%v [%s]@%s",
|
||||||
|
taxonomyCode,
|
||||||
|
*node.id,
|
||||||
|
node.ScientificName(),
|
||||||
|
node.Rank(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
return fmt.Sprintf("%s:%v",
|
||||||
|
taxonomyCode,
|
||||||
|
*node.id)
|
||||||
|
}
|
||||||
|
|
||||||
// String returns a string representation of the TaxNode, including the taxonomy code,
|
// String returns a string representation of the TaxNode, including the taxonomy code,
|
||||||
// the node ID, and the scientific name. The output format is "taxonomyCode:id [scientificName]".
|
// the node ID, and the scientific name. The output format is "taxonomyCode:id [scientificName]".
|
||||||
//
|
//
|
||||||
@@ -42,19 +60,7 @@ func (node *TaxNode) String(taxonomyCode string) string {
|
|||||||
return *node.id
|
return *node.id
|
||||||
}
|
}
|
||||||
|
|
||||||
if node.HasScientificName() {
|
return node.FullString(taxonomyCode)
|
||||||
return fmt.Sprintf("%s:%v [%s]@%s",
|
|
||||||
taxonomyCode,
|
|
||||||
*node.id,
|
|
||||||
node.ScientificName(),
|
|
||||||
node.Rank(),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
return fmt.Sprintf("%s:%v",
|
|
||||||
taxonomyCode,
|
|
||||||
*node.id)
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Id returns the unique identifier of the TaxNode.
|
// Id returns the unique identifier of the TaxNode.
|
||||||
|
|||||||
@@ -21,12 +21,10 @@ func PairingOptionSet(options *getoptions.GetOpt) {
|
|||||||
options.StringVar(&_ForwardFile, "forward-reads", "",
|
options.StringVar(&_ForwardFile, "forward-reads", "",
|
||||||
options.Alias("F"),
|
options.Alias("F"),
|
||||||
options.ArgName("FILENAME_F"),
|
options.ArgName("FILENAME_F"),
|
||||||
options.Required("You must provide at a forward file"),
|
|
||||||
options.Description("The file names containing the forward reads"))
|
options.Description("The file names containing the forward reads"))
|
||||||
options.StringVar(&_ReverseFile, "reverse-reads", "",
|
options.StringVar(&_ReverseFile, "reverse-reads", "",
|
||||||
options.Alias("R"),
|
options.Alias("R"),
|
||||||
options.ArgName("FILENAME_R"),
|
options.ArgName("FILENAME_R"),
|
||||||
options.Required("You must provide a reverse file"),
|
|
||||||
options.Description("The file names containing the reverse reads"))
|
options.Description("The file names containing the reverse reads"))
|
||||||
options.IntVar(&_Delta, "delta", _Delta,
|
options.IntVar(&_Delta, "delta", _Delta,
|
||||||
options.Alias("D"),
|
options.Alias("D"),
|
||||||
@@ -72,6 +70,10 @@ func CLIPairedSequence() (obiiter.IBioSequence, error) {
|
|||||||
return paired, nil
|
return paired, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func CLIHasPairedFiles() bool {
|
||||||
|
return _ForwardFile != "" && _ReverseFile != ""
|
||||||
|
}
|
||||||
|
|
||||||
func CLIDelta() int {
|
func CLIDelta() int {
|
||||||
return _Delta
|
return _Delta
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -114,10 +114,10 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
|
|||||||
aanot["obimultiplex_direction"] = direction
|
aanot["obimultiplex_direction"] = direction
|
||||||
|
|
||||||
aanot["obimultiplex_forward_match"] = forward_match
|
aanot["obimultiplex_forward_match"] = forward_match
|
||||||
aanot["obimultiplex_forward_mismatches"] = forward_mismatches
|
aanot["obimultiplex_forward_error"] = forward_mismatches
|
||||||
|
|
||||||
aanot["obimultiplex_reverse_match"] = reverse_match
|
aanot["obimultiplex_reverse_match"] = reverse_match
|
||||||
aanot["obimultiplex_reverse_mismatches"] = reverse_mismatches
|
aanot["obimultiplex_reverse_error"] = reverse_mismatches
|
||||||
|
|
||||||
aanot["sample"] = sample
|
aanot["sample"] = sample
|
||||||
aanot["experiment"] = experiment
|
aanot["experiment"] = experiment
|
||||||
@@ -125,10 +125,10 @@ func IPCRTagPESequencesBatch(iterator obiiter.IBioSequence,
|
|||||||
banot["obimultiplex_direction"] = direction
|
banot["obimultiplex_direction"] = direction
|
||||||
|
|
||||||
banot["obimultiplex_forward_match"] = forward_match
|
banot["obimultiplex_forward_match"] = forward_match
|
||||||
banot["obimultiplex_forward_mismatches"] = forward_mismatches
|
banot["obimultiplex_forward_error"] = forward_mismatches
|
||||||
|
|
||||||
banot["obimultiplex_reverse_match"] = reverse_match
|
banot["obimultiplex_reverse_match"] = reverse_match
|
||||||
banot["obimultiplex_reverse_mismatches"] = reverse_mismatches
|
banot["obimultiplex_reverse_error"] = reverse_mismatches
|
||||||
|
|
||||||
banot["sample"] = sample
|
banot["sample"] = sample
|
||||||
banot["experiment"] = experiment
|
banot["experiment"] = experiment
|
||||||
|
|||||||
@@ -276,6 +276,44 @@ func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func InterfaceToMapOfIntSlice(i interface{}) (val map[string][]int, err error) {
|
||||||
|
err = nil
|
||||||
|
switch m := i.(type) {
|
||||||
|
case map[string][]int:
|
||||||
|
val = m
|
||||||
|
case map[string]interface{}:
|
||||||
|
val = make(map[string][]int, len(m))
|
||||||
|
for k, v := range m {
|
||||||
|
val[k], err = InterfaceToIntSlice(v)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
err = &NotAMapInt{"value attribute cannot be casted to a map[string][]int"}
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
func InterfaceToMapOfStringSlice(i interface{}) (val map[string][]string, err error) {
|
||||||
|
err = nil
|
||||||
|
switch m := i.(type) {
|
||||||
|
case map[string][]string:
|
||||||
|
val = m
|
||||||
|
case map[string]interface{}:
|
||||||
|
val = make(map[string][]string, len(m))
|
||||||
|
for k, v := range m {
|
||||||
|
val[k], err = InterfaceToStringSlice(v)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
err = &NotAMapInt{"value attribute cannot be casted to a map[string][]string"}
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
func InterfaceToStringSlice(i interface{}) (val []string, err error) {
|
func InterfaceToStringSlice(i interface{}) (val []string, err error) {
|
||||||
err = nil
|
err = nil
|
||||||
|
|
||||||
|
|||||||
+365
-4
@@ -34,6 +34,26 @@ func MinMaxSlice[T constraints.Ordered](vec []T) (min, max T) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func FilterMinSlice[T constraints.Ordered](vec []T, minimum T) []T {
|
||||||
|
result := make([]T, 0, len(vec))
|
||||||
|
for _, v := range vec {
|
||||||
|
if v >= minimum {
|
||||||
|
result = append(result, v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func FilterMaxSlice[T constraints.Ordered](vec []T, maximum T) []T {
|
||||||
|
result := make([]T, 0, len(vec))
|
||||||
|
for _, v := range vec {
|
||||||
|
if v <= maximum {
|
||||||
|
result = append(result, v)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
func MaxMap[K comparable, T constraints.Ordered](values map[K]T) (K, T, error) {
|
func MaxMap[K comparable, T constraints.Ordered](values map[K]T) (K, T, error) {
|
||||||
var maxKey K
|
var maxKey K
|
||||||
var maxValue T
|
var maxValue T
|
||||||
@@ -73,6 +93,46 @@ func MinMap[K comparable, T constraints.Ordered](values map[K]T) (K, T, error) {
|
|||||||
return minKey, minValue, nil
|
return minKey, minValue, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func FilterMinMap[K comparable, T constraints.Ordered](values map[K]T, minimum T) map[K]T {
|
||||||
|
result := make(map[K]T)
|
||||||
|
for k, v := range values {
|
||||||
|
if v >= minimum {
|
||||||
|
result[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func FilterMaxMap[K comparable, T constraints.Ordered](values map[K]T, maximum T) map[K]T {
|
||||||
|
result := make(map[K]T)
|
||||||
|
for k, v := range values {
|
||||||
|
if v <= maximum {
|
||||||
|
result[k] = v
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaturatingSubSlice[T Numeric](vec []T, sub T) []T {
|
||||||
|
result := make([]T, len(vec))
|
||||||
|
for i, v := range vec {
|
||||||
|
if v > sub {
|
||||||
|
result[i] = v - sub
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaturatingSubMap[K comparable, T Numeric](values map[K]T, sub T) map[K]T {
|
||||||
|
result := make(map[K]T)
|
||||||
|
for k, v := range values {
|
||||||
|
if v > sub {
|
||||||
|
result[k] = v - sub
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
|
|
||||||
// Min returns the smallest element in a slice/array or map,
|
// Min returns the smallest element in a slice/array or map,
|
||||||
// or the value itself if data is a single comparable value.
|
// or the value itself if data is a single comparable value.
|
||||||
// Returns an error if the container is empty or the type is unsupported.
|
// Returns an error if the container is empty or the type is unsupported.
|
||||||
@@ -135,11 +195,121 @@ func Max(data interface{}) (interface{}, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func FilterMin(data interface{}, minimum interface{}) (interface{}, error) {
|
||||||
|
v := reflect.ValueOf(data)
|
||||||
|
switch v.Kind() {
|
||||||
|
case reflect.Slice, reflect.Array:
|
||||||
|
if v.Len() == 0 {
|
||||||
|
return nil, errors.New("empty slice or array")
|
||||||
|
}
|
||||||
|
return filterMinFromIterable(v, minimum)
|
||||||
|
case reflect.Map:
|
||||||
|
if v.Len() == 0 {
|
||||||
|
return nil, errors.New("empty map")
|
||||||
|
}
|
||||||
|
return filterMinFromMap(v, minimum)
|
||||||
|
default:
|
||||||
|
if !isOrderedKind(v.Kind()) {
|
||||||
|
return nil, fmt.Errorf("unsupported type: %s", v.Kind())
|
||||||
|
}
|
||||||
|
return data, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func FilterMax(data interface{}, maximum interface{}) (interface{}, error) {
|
||||||
|
v := reflect.ValueOf(data)
|
||||||
|
switch v.Kind() {
|
||||||
|
case reflect.Slice, reflect.Array:
|
||||||
|
if v.Len() == 0 {
|
||||||
|
return nil, errors.New("empty slice or array")
|
||||||
|
}
|
||||||
|
return filterMaxFromIterable(v, maximum)
|
||||||
|
case reflect.Map:
|
||||||
|
if v.Len() == 0 {
|
||||||
|
return nil, errors.New("empty map")
|
||||||
|
}
|
||||||
|
return filterMaxFromMap(v, maximum)
|
||||||
|
default:
|
||||||
|
if !isOrderedKind(v.Kind()) {
|
||||||
|
return nil, fmt.Errorf("unsupported type: %s", v.Kind())
|
||||||
|
}
|
||||||
|
return data, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func SaturatingSub(data interface{}, sub interface{}) (interface{}, error) {
|
||||||
|
v := reflect.ValueOf(data)
|
||||||
|
switch v.Kind() {
|
||||||
|
case reflect.Slice, reflect.Array:
|
||||||
|
return saturatingSubFromIterable(v, sub)
|
||||||
|
case reflect.Map:
|
||||||
|
return saturatingSubFromMap(v, sub)
|
||||||
|
default:
|
||||||
|
if !isNumericKind(v.Kind()) {
|
||||||
|
return nil, fmt.Errorf("unsupported type: %s", v.Kind())
|
||||||
|
}
|
||||||
|
r, err := saturatingSubValues(v, reflect.ValueOf(sub))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return r.Interface(), nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func saturatingSubFromIterable(v reflect.Value, sub interface{}) (interface{}, error) {
|
||||||
|
subVal := reflect.ValueOf(sub)
|
||||||
|
result := reflect.MakeSlice(v.Type(), v.Len(), v.Len())
|
||||||
|
for i := 0; i < v.Len(); i++ {
|
||||||
|
r, err := saturatingSubValues(v.Index(i), subVal)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
result.Index(i).Set(r)
|
||||||
|
}
|
||||||
|
return result.Interface(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func saturatingSubFromMap(v reflect.Value, sub interface{}) (interface{}, error) {
|
||||||
|
subVal := reflect.ValueOf(sub)
|
||||||
|
result := reflect.MakeMap(v.Type())
|
||||||
|
for _, key := range v.MapKeys() {
|
||||||
|
r, err := saturatingSubValues(v.MapIndex(key), subVal)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
if !r.IsZero() {
|
||||||
|
result.SetMapIndex(key, r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result.Interface(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func saturatingSubValues(a, b reflect.Value) (reflect.Value, error) {
|
||||||
|
result := reflect.New(a.Type()).Elem()
|
||||||
|
switch a.Kind() {
|
||||||
|
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
|
||||||
|
if av, bv := a.Int(), b.Int(); av > bv {
|
||||||
|
result.SetInt(av - bv)
|
||||||
|
}
|
||||||
|
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64:
|
||||||
|
if av, bv := a.Uint(), b.Uint(); av > bv {
|
||||||
|
result.SetUint(av - bv)
|
||||||
|
}
|
||||||
|
case reflect.Float32, reflect.Float64:
|
||||||
|
if av, bv := a.Float(), b.Float(); av > bv {
|
||||||
|
result.SetFloat(av - bv)
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
return reflect.Value{}, fmt.Errorf("unsupported type for saturating subtraction: %s", a.Kind())
|
||||||
|
}
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
// maxFromIterable scans a slice/array to find the maximum.
|
// maxFromIterable scans a slice/array to find the maximum.
|
||||||
func maxFromIterable(v reflect.Value) (interface{}, error) {
|
func maxFromIterable(v reflect.Value) (interface{}, error) {
|
||||||
var best reflect.Value
|
var best reflect.Value
|
||||||
for i := 0; i < v.Len(); i++ {
|
for i := 0; i < v.Len(); i++ {
|
||||||
elem := v.Index(i)
|
elem := unwrapInterface(v.Index(i))
|
||||||
if !isOrderedKind(elem.Kind()) {
|
if !isOrderedKind(elem.Kind()) {
|
||||||
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
}
|
}
|
||||||
@@ -154,7 +324,7 @@ func maxFromIterable(v reflect.Value) (interface{}, error) {
|
|||||||
func minFromIterable(v reflect.Value) (interface{}, error) {
|
func minFromIterable(v reflect.Value) (interface{}, error) {
|
||||||
var minVal reflect.Value
|
var minVal reflect.Value
|
||||||
for i := 0; i < v.Len(); i++ {
|
for i := 0; i < v.Len(); i++ {
|
||||||
elem := v.Index(i)
|
elem := unwrapInterface(v.Index(i))
|
||||||
if !isOrderedKind(elem.Kind()) {
|
if !isOrderedKind(elem.Kind()) {
|
||||||
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
}
|
}
|
||||||
@@ -165,12 +335,182 @@ func minFromIterable(v reflect.Value) (interface{}, error) {
|
|||||||
return minVal.Interface(), nil
|
return minVal.Interface(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func filterMinFromIterable(v reflect.Value, minimum interface{}) (interface{}, error) {
|
||||||
|
minVal := reflect.ValueOf(minimum)
|
||||||
|
result := reflect.MakeSlice(v.Type(), 0, v.Len())
|
||||||
|
for i := 0; i < v.Len(); i++ {
|
||||||
|
elem := unwrapInterface(v.Index(i))
|
||||||
|
if !isOrderedKind(elem.Kind()) {
|
||||||
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
|
}
|
||||||
|
if !less(elem, minVal) { // elem >= minimum
|
||||||
|
result = reflect.Append(result, elem)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result.Interface(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterMaxFromIterable(v reflect.Value, maximum interface{}) (interface{}, error) {
|
||||||
|
maxVal := reflect.ValueOf(maximum)
|
||||||
|
result := reflect.MakeSlice(v.Type(), 0, v.Len())
|
||||||
|
for i := 0; i < v.Len(); i++ {
|
||||||
|
elem := unwrapInterface(v.Index(i))
|
||||||
|
if !isOrderedKind(elem.Kind()) {
|
||||||
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
|
}
|
||||||
|
if !greater(elem, maxVal) { // elem <= maximum
|
||||||
|
result = reflect.Append(result, elem)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result.Interface(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// whichMaxFromIterable returns the index of the maximum element in a slice/array.
|
||||||
|
func whichMaxFromIterable(v reflect.Value) (int, error) {
|
||||||
|
var best reflect.Value
|
||||||
|
bestIdx := 0
|
||||||
|
for i := 0; i < v.Len(); i++ {
|
||||||
|
elem := unwrapInterface(v.Index(i))
|
||||||
|
if !isOrderedKind(elem.Kind()) {
|
||||||
|
return 0, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
|
}
|
||||||
|
if i == 0 || greater(elem, best) {
|
||||||
|
best = elem
|
||||||
|
bestIdx = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bestIdx, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// whichMinFromIterable returns the index of the minimum element in a slice/array.
|
||||||
|
func whichMinFromIterable(v reflect.Value) (int, error) {
|
||||||
|
var minVal reflect.Value
|
||||||
|
minIdx := 0
|
||||||
|
for i := 0; i < v.Len(); i++ {
|
||||||
|
elem := unwrapInterface(v.Index(i))
|
||||||
|
if !isOrderedKind(elem.Kind()) {
|
||||||
|
return 0, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
|
}
|
||||||
|
if i == 0 || less(elem, minVal) {
|
||||||
|
minVal = elem
|
||||||
|
minIdx = i
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return minIdx, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// whichMaxFromMap returns the key associated with the maximum value in a map.
|
||||||
|
func whichMaxFromMap(v reflect.Value) (interface{}, error) {
|
||||||
|
var best reflect.Value
|
||||||
|
var bestKey reflect.Value
|
||||||
|
first := true
|
||||||
|
for _, key := range v.MapKeys() {
|
||||||
|
elem := unwrapInterface(v.MapIndex(key))
|
||||||
|
if !isOrderedKind(elem.Kind()) {
|
||||||
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
|
}
|
||||||
|
if first || greater(elem, best) {
|
||||||
|
best = elem
|
||||||
|
bestKey = key
|
||||||
|
first = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return bestKey.Interface(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// whichMinFromMap returns the key associated with the minimum value in a map.
|
||||||
|
func whichMinFromMap(v reflect.Value) (interface{}, error) {
|
||||||
|
var minVal reflect.Value
|
||||||
|
var minKey reflect.Value
|
||||||
|
first := true
|
||||||
|
for _, key := range v.MapKeys() {
|
||||||
|
elem := unwrapInterface(v.MapIndex(key))
|
||||||
|
if !isOrderedKind(elem.Kind()) {
|
||||||
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
|
}
|
||||||
|
if first || less(elem, minVal) {
|
||||||
|
minVal = elem
|
||||||
|
minKey = key
|
||||||
|
first = false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return minKey.Interface(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// WhichMax returns the key (for a map) or index (for a slice/array) of the maximum value.
|
||||||
|
func WhichMax(data interface{}) (interface{}, error) {
|
||||||
|
v := reflect.ValueOf(data)
|
||||||
|
switch v.Kind() {
|
||||||
|
case reflect.Slice, reflect.Array:
|
||||||
|
if v.Len() == 0 {
|
||||||
|
return nil, errors.New("empty slice or array")
|
||||||
|
}
|
||||||
|
return whichMaxFromIterable(v)
|
||||||
|
case reflect.Map:
|
||||||
|
if v.Len() == 0 {
|
||||||
|
return nil, errors.New("empty map")
|
||||||
|
}
|
||||||
|
return whichMaxFromMap(v)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unsupported type: %s", v.Kind())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WhichMin returns the key (for a map) or index (for a slice/array) of the minimum value.
|
||||||
|
func WhichMin(data interface{}) (interface{}, error) {
|
||||||
|
v := reflect.ValueOf(data)
|
||||||
|
switch v.Kind() {
|
||||||
|
case reflect.Slice, reflect.Array:
|
||||||
|
if v.Len() == 0 {
|
||||||
|
return nil, errors.New("empty slice or array")
|
||||||
|
}
|
||||||
|
return whichMinFromIterable(v)
|
||||||
|
case reflect.Map:
|
||||||
|
if v.Len() == 0 {
|
||||||
|
return nil, errors.New("empty map")
|
||||||
|
}
|
||||||
|
return whichMinFromMap(v)
|
||||||
|
default:
|
||||||
|
return nil, fmt.Errorf("unsupported type: %s", v.Kind())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterMinFromMap(v reflect.Value, minimum interface{}) (interface{}, error) {
|
||||||
|
minVal := reflect.ValueOf(minimum)
|
||||||
|
result := reflect.MakeMap(v.Type())
|
||||||
|
for _, key := range v.MapKeys() {
|
||||||
|
elem := unwrapInterface(v.MapIndex(key))
|
||||||
|
if !isOrderedKind(elem.Kind()) {
|
||||||
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
|
}
|
||||||
|
if !less(elem, minVal) { // elem >= minimum
|
||||||
|
result.SetMapIndex(key, elem)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result.Interface(), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func filterMaxFromMap(v reflect.Value, maximum interface{}) (interface{}, error) {
|
||||||
|
maxVal := reflect.ValueOf(maximum)
|
||||||
|
result := reflect.MakeMap(v.Type())
|
||||||
|
for _, key := range v.MapKeys() {
|
||||||
|
elem := unwrapInterface(v.MapIndex(key))
|
||||||
|
if !isOrderedKind(elem.Kind()) {
|
||||||
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
|
}
|
||||||
|
if !greater(elem, maxVal) { // elem <= maximum
|
||||||
|
result.SetMapIndex(key, elem)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result.Interface(), nil
|
||||||
|
}
|
||||||
|
|
||||||
// maxFromMap scans map values to find the maximum.
|
// maxFromMap scans map values to find the maximum.
|
||||||
func maxFromMap(v reflect.Value) (interface{}, error) {
|
func maxFromMap(v reflect.Value) (interface{}, error) {
|
||||||
var best reflect.Value
|
var best reflect.Value
|
||||||
first := true
|
first := true
|
||||||
for _, key := range v.MapKeys() {
|
for _, key := range v.MapKeys() {
|
||||||
elem := v.MapIndex(key)
|
elem := unwrapInterface(v.MapIndex(key))
|
||||||
if !isOrderedKind(elem.Kind()) {
|
if !isOrderedKind(elem.Kind()) {
|
||||||
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
}
|
}
|
||||||
@@ -187,7 +527,7 @@ func minFromMap(v reflect.Value) (interface{}, error) {
|
|||||||
var minVal reflect.Value
|
var minVal reflect.Value
|
||||||
first := true
|
first := true
|
||||||
for _, key := range v.MapKeys() {
|
for _, key := range v.MapKeys() {
|
||||||
elem := v.MapIndex(key)
|
elem := unwrapInterface(v.MapIndex(key))
|
||||||
if !isOrderedKind(elem.Kind()) {
|
if !isOrderedKind(elem.Kind()) {
|
||||||
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
return nil, fmt.Errorf("unsupported element type: %s", elem.Kind())
|
||||||
}
|
}
|
||||||
@@ -199,6 +539,27 @@ func minFromMap(v reflect.Value) (interface{}, error) {
|
|||||||
return minVal.Interface(), nil
|
return minVal.Interface(), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isNumericKind(k reflect.Kind) bool {
|
||||||
|
switch k {
|
||||||
|
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64,
|
||||||
|
reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64,
|
||||||
|
reflect.Float32, reflect.Float64:
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// unwrapInterface returns v.Elem() when v holds an interface value, otherwise v unchanged.
|
||||||
|
// This is necessary when iterating map[string]interface{} or []interface{} via reflection:
|
||||||
|
// the element Kind is reflect.Interface, not the underlying concrete type.
|
||||||
|
func unwrapInterface(v reflect.Value) reflect.Value {
|
||||||
|
if v.Kind() == reflect.Interface {
|
||||||
|
return v.Elem()
|
||||||
|
}
|
||||||
|
return v
|
||||||
|
}
|
||||||
|
|
||||||
// isOrderedKind reports whether k supports comparison ordering.
|
// isOrderedKind reports whether k supports comparison ordering.
|
||||||
func isOrderedKind(k reflect.Kind) bool {
|
func isOrderedKind(k reflect.Kind) bool {
|
||||||
switch k {
|
switch k {
|
||||||
|
|||||||
+1
-1
@@ -1 +1 @@
|
|||||||
4.4.41
|
4.4.44
|
||||||
|
|||||||
Reference in New Issue
Block a user