Compare commits

...

43 Commits

Author SHA1 Message Date
Eric Coissac
7b23314651 Some typos 2025-03-01 08:29:27 +01:00
Eric Coissac
1e541eac4c Last commit version 2025-03-01 08:24:26 +01:00
Eric Coissac
13cd4c86ac Patch the bug on --out with paired sequence files 2025-02-27 18:13:21 +01:00
Eric Coissac
75dd535201 Add a --valid-taxid option to obigrep 2025-02-27 18:12:55 +01:00
Eric Coissac
573acafafc Patch bug on ecotag with too short sequences 2025-02-27 15:09:07 +01:00
Eric Coissac
0067152c2b Patch the production of the ratio file 2025-02-27 10:19:39 +01:00
Eric Coissac
791d253edc Generate the ratio file as compressed if -Z option enabled. 2025-02-27 09:06:07 +01:00
Eric Coissac
6245d7f684 Changes to be committed:
modified:   .gitignore
2025-02-24 15:47:45 +01:00
Eric Coissac
13d610aff7 Changes to be committed:
modified:   pkg/obioptions/version.go
	modified:   pkg/obitools/obiclean/chimera.go
2025-02-24 15:25:45 +01:00
Eric Coissac
db284f1d44 Add an experimental chimera detection... 2025-02-24 15:02:49 +01:00
Eric Coissac
51b3e83d32 some cleaning 2025-02-24 11:31:49 +01:00
Eric Coissac
8671285d02 add the --min-sample-count option to obiclean. 2025-02-24 08:48:31 +01:00
Eric Coissac
51d11aa36d Changes to be committed:
modified:   pkg/obialign/alignment.go
	modified:   pkg/obialign/pairedendalign.go
	modified:   pkg/obioptions/version.go
	modified:   pkg/obitools/obipairing/pairing.go
2025-02-23 17:37:56 +01:00
Eric Coissac
fb6f857d8c Update the computation of the consensus quality score 2025-02-23 15:16:31 +01:00
Eric Coissac
d4209b4549 Add a basic test for obiparing 2025-02-22 09:57:44 +01:00
Eric Coissac
ef05d4975f Upadte the scoring schema of obipairing 2025-02-21 22:41:34 +01:00
Eric Coissac
4588bf8b5d Patch the make file to fail on error 2025-02-19 15:55:07 +01:00
Eric Coissac
090633850d Changes to be committed:
modified:   obitests/obitools/obicount/test.sh
2025-02-19 15:28:42 +01:00
Eric Coissac
15a058cf63 with all the sample files for tests 2025-02-19 15:27:38 +01:00
Eric Coissac
2f5f7634d6 Changes to be committed:
modified:   obitests/obitools/obicount/test.sh
2025-02-19 14:50:10 +01:00
Eric Coissac
48138b605c Changes to be committed:
modified:   .github/workflows/obitest.yml
	modified:   Makefile
	modified:   obitests/obitools/obicount/test.sh
2025-02-19 14:37:05 +01:00
Eric Coissac
aed22c12a6 Changes to be committed:
modified:   obitests/obitools/obicount/test.sh
2025-02-19 14:34:22 +01:00
Eric Coissac
443a9b3ce3 Changes to be committed:
modified:   Makefile
	modified:   obitests/obitools/obicount/test.sh
2025-02-19 14:28:49 +01:00
Eric Coissac
7e90537379 For run of test using bash in makefile 2025-02-19 13:58:52 +01:00
Eric Coissac
d3d15acc6c Changes to be committed:
modified:   obitests/obitools/obicount/test.sh
	modified:   pkg/obioptions/version.go
2025-02-19 13:54:01 +01:00
Eric Coissac
bd4a0b5ca5 Essais d'une google action pour lancer les tests des obitools 2025-02-19 13:45:43 +01:00
Eric Coissac
952f85f312 A first trial of a test for obicount 2025-02-19 13:17:36 +01:00
Eric Coissac
4774438644 Changes to be committed:
modified:   pkg/obiformats/universal_read.go
	modified:   pkg/obioptions/version.go
	modified:   pkg/obiseq/taxonomy_methods.go
2025-02-12 08:40:38 +01:00
Eric Coissac
6a8061cc4f Add managment of the taxonomy alias politic 2025-02-10 14:05:47 +01:00
Eric Coissac
e2563cd8df Patch a bug in registering merged taxa 2025-02-10 11:42:46 +01:00
Eric Coissac
f2e81adf95 Changes to be committed:
modified:   .gitignore
	deleted:    xxx.csv
2025-02-05 19:28:19 +01:00
Eric Coissac
f27e9bc91e patch a bug related to csv and qualities 2025-02-05 19:27:00 +01:00
Eric Coissac
773e54965d Patch a bug on compressed output 2025-02-05 14:18:24 +01:00
Eric Coissac
ceca33998b add extensions fq in directory scanning 2025-02-04 20:34:58 +01:00
Eric Coissac
b9bee5f426 Changes to be committed:
modified:   go.mod
	modified:   go.sum
	modified:   pkg/obilua/obilib.go
	modified:   pkg/obilua/obiseq.go
	modified:   pkg/obilua/obiseqslice.go
	new file:   pkg/obilua/obitaxon.go
	new file:   pkg/obilua/obitaxonomy.go
	modified:   pkg/obioptions/version.go
2025-02-02 16:52:52 +01:00
Eric Coissac
c10df073a7 Changes to be committed:
modified:   pkg/obioptions/version.go
	modified:   pkg/obitax/iterator.go
2025-02-01 12:06:19 +01:00
Eric Coissac
d3dac1b21f Make obitag able to use the taxonomic path included in reference database as taxonomy 2025-01-30 11:50:03 +01:00
Eric Coissac
0df082da06 Adds possibility to extract a taxonomy from taxonomic path included in sequence files 2025-01-30 11:18:21 +01:00
Eric Coissac
2452aef7a9 patch multiple -Z options 2025-01-29 21:35:28 +01:00
Eric Coissac
337954592d add the --out option to the obitaxonomy 2025-01-29 13:22:35 +01:00
Eric Coissac
8a28c9ae7c add the --download-ncbi option to obitaxonomy 2025-01-29 12:38:39 +01:00
Eric Coissac
b6b18c0fa1 Changes to be committed:
modified:   pkg/obioptions/version.go
2025-01-29 11:34:01 +01:00
Eric Coissac
67e2758d63 Switch to realease number 4.3.0 2025-01-29 11:33:30 +01:00
74 changed files with 2063 additions and 5096 deletions

19
.github/workflows/obitest.yml vendored Normal file
View File

@@ -0,0 +1,19 @@
name: "Run the obitools command test suite"
on:
push:
branches:
- master
- V*
jobs:
build:
runs-on: ubuntu-latest
steps:
- name: Setup Go
uses: actions/setup-go@v2
with:
go-version: '1.23'
- name: Checkout obitools4 project
uses: actions/checkout@v4
- name: Run tests
run: make githubtests

153
.gitignore vendored
View File

@@ -1,134 +1,27 @@
cpu.pprof **/cpu.pprof
cpu.trace **/cpu.trace
test **/test
bin **/bin
vendor **/vendor
*.fastq **/*.fastq
*.fasta **/*.fasta
*.fastq.gz **/*.fastq.gz
*.fasta.gz **/*.fasta.gz
.DS_Store **/.DS_Store
*.gml **/*.gml
*.log **/*.log
/argaly **/xxx*
**/*.sav
**/*.old
**/*.tgz
**/*.yaml
**/*.csv
/obiconvert .rhistory
/obicount /.vscode
/obimultiplex
/obipairing
/obipcr
/obifind
/obidistribute
/obiuniq
/build /build
/Makefile.old
.Rproj.user
obitools.Rproj
Stat_error.knit.md
.Rhistory
Stat_error.nb.html
Stat_error.Rmd
/.luarc.json /ncbitaxo
/doc/TAXO/
/doc/results/
/doc/_main.log
/doc/_book/_main.tex
/doc/_freeze/
/doc/tutorial_files/
/doc/wolf_data/
/taxdump/
/.vscode/
/Algo-Alignement.numbers !/obitests/**
/Estimate_proba_true_seq.html !/sample/**
/Estimate_proba_true_seq.nb.html
/Estimate_proba_true_seq.Rmd
/modele_error_euka.qmd
/obitools.code-workspace
.DS_Store
.RData
x
xxx
y
/doc/wolf_diet.tgz
/doc/man/depends
/sample/wolf_R1.fasta.gz
/sample/wolf_R2.fasta.gz
/sample/euka03.ecotag.fasta.gz
/sample/ratio.csv
/sample/STD_PLN_1.dat
/sample/STD_PLN_2.dat
/sample/subset_Pasvik_R1.fastq.gz
/sample/subset_Pasvik_R2.fastq.gz
/sample/test_gobitools.fasta.bz2
euka03.csv*
gbbct793.seq.gz
gbinv1003.seq.gz
gbpln210.seq
/doc/book/OBITools-V4.aux
/doc/book/OBITools-V4.fdb_latexmk
/doc/book/OBITools-V4.fls
/doc/book/OBITools-V4.log
/doc/book/OBITools-V4.pdf
/doc/book/OBITools-V4.synctex.gz
/doc/book/OBITools-V4.tex
/doc/book/OBITools-V4.toc
getoptions.adoc
Archive.zip
.DS_Store
sample/.DS_Store
sample/consensus_graphs/specimen_hac_plants_Vern_disicolor_.gml
93954
Bact03.e5.gb_R254.obipcr.idx.fasta.save
sample/test.obipcr.log
Bact02.e3.gb_R254.obipcr.fasta.gz
Example_Arth03.ngsfilter
SPER01.csv
SPER03.csv
wolf_diet_ngsfilter.txt
xx
xxx.gb
yyy_geom.csv
yyy_LCS.csv
yyy.json
bug_obimultiplex/toto
bug_obimultiplex/toto_mapping
bug_obimultiplex/tutu
bug_obimultiplex/tutu_mapping
bug_obipairing/GIT1_GH_ngsfilter.txt
doc/book/TAXO/citations.dmp
doc/book/TAXO/delnodes.dmp
doc/book/TAXO/division.dmp
doc/book/TAXO/gc.prt
doc/book/TAXO/gencode.dmp
doc/book/TAXO/merged.dmp
doc/book/TAXO/names.dmp
doc/book/TAXO/nodes.dmp
doc/book/TAXO/readme.txt
doc/book/wolf_data/Release-253/ncbitaxo/citations.dmp
doc/book/wolf_data/Release-253/ncbitaxo/delnodes.dmp
doc/book/wolf_data/Release-253/ncbitaxo/division.dmp
doc/book/wolf_data/Release-253/ncbitaxo/gc.prt
doc/book/wolf_data/Release-253/ncbitaxo/gencode.dmp
doc/book/wolf_data/Release-253/ncbitaxo/merged.dmp
doc/book/wolf_data/Release-253/ncbitaxo/names.dmp
doc/book/wolf_data/Release-253/ncbitaxo/nodes.dmp
doc/book/wolf_data/Release-253/ncbitaxo/readme.txt
doc/book/results/toto.tasta
sample/.DS_Store
GO
ncbitaxo/citations.dmp
ncbitaxo/delnodes.dmp
ncbitaxo/division.dmp
ncbitaxo/gc.prt
ncbitaxo/gencode.dmp
ncbitaxo/merged.dmp
ncbitaxo/names.dmp
ncbitaxo/nodes.dmp
ncbitaxo/readme.txt
template.16S
xxx.gz
*.sav
*.old
ncbitaxo.tgz

View File

@@ -63,6 +63,13 @@ update-deps:
test: test:
$(GOTEST) ./... $(GOTEST) ./...
obitests:
@for t in $$(find obitests -name test.sh -print) ; do \
bash $${t} ;\
done
githubtests: obitools obitests
man: man:
make -C doc man make -C doc man
@@ -97,5 +104,5 @@ ifneq ($(strip $(COMMIT_ID)),)
@rm -f $(OUTPUT) @rm -f $(OUTPUT)
endif endif
.PHONY: all packages obitools man obibook doc update-deps .FORCE .PHONY: all packages obitools man obibook doc update-deps obitests githubtests .FORCE
.FORCE: .FORCE:

View File

@@ -37,7 +37,7 @@ curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install
bash -s -- --install-dir test_install --obitools-prefix k bash -s -- --install-dir test_install --obitools-prefix k
``` ```
In this case, the binaries will be installed in the `test_install` directory and all command names will be prefixed with the letter `k`. Thus `obigrep` will be named `kobigrep`. In this case, the binaries will be installed in the `test_install` directory and all command names will be prefixed with the letter `k`. Thus, `obigrep` will be named `kobigrep`.
## Continuing the analysis... ## Continuing the analysis...

View File

@@ -1,19 +1,29 @@
# OBITools release notes # OBITools release notes
## Latest changes ## March 2nd, 2025. Release 4.3.0
A new documentation website is available at https://obitools4.metabarcoding.org.
Its development is still in progress.
### Breaking changes ### Breaking changes
- In `obimultiplex`, the short version of the **--tag-list** option used to specify the list - In `obimultiplex`, the short version of the **--tag-list** option used to
of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`. specify the list of tags and primers to be used for the demultiplexing has
been changed from `-t` to `-s`.
- The command `obifind` is now renamed `obitaxonomy`. - The command `obifind` is now renamed `obitaxonomy`.
- The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy - The **--taxdump** option used to specify the path to the taxdump containing
has been renamed to **--taxonomy**. the NCBI taxonomy has been renamed to **--taxonomy**.
### Bug fixes ### Bug fixes
- Correction of a bug when using paired sequence file with the **--out** option.
- Correction of a bug in `obitag` when trying to annotate very short sequences of
4 bases or less.
- In `obipairing`, correct the stats `seq_a_single` and `seq_b_single` when - In `obipairing`, correct the stats `seq_a_single` and `seq_b_single` when
on right alignment mode on right alignment mode
@@ -21,12 +31,32 @@
the batch size and not reading the qualities from the fastq files as `obiuniq` the batch size and not reading the qualities from the fastq files as `obiuniq`
is producing only fasta output without qualities. is producing only fasta output without qualities.
- In `obitag`, correct the wrong assignment of the **obitag_bestmatch**
attribute.
- In `obiclean`, the **--no-progress-bar** option disables all progress bars,
not just the data.
- Several fixes in reading FASTA and FASTQ files, including some code
simplification and factorization.
- Fixed a bug in all obitools that caused the same file to be processed
multiple times, when specifying a directory name as input.
### New features ### New features
- `obigrep` add a new **--valid-taxid** option to keep only sequence with a
valid taxid
- `obiclean` add a new **--min-sample-count** option with a default value of 1,
asking to filter out sequences which are not occurring in at least the
specified number of samples.
- `obitaxonomy` a new **--dump|D** option allows for dumping a sub-taxonomy. - `obitaxonomy` a new **--dump|D** option allows for dumping a sub-taxonomy.
- Taxonomy dump can now be provided as a four-columns CSV file to the **--taxonomy** - Taxonomy dump can now be provided as a four-columns CSV file to the
option. **--taxonomy** option.
- NCBI Taxonomy dump does not need to be uncompressed and unarchived anymore. The - NCBI Taxonomy dump does not need to be uncompressed and unarchived anymore. The
path of the tar and gziped dump file can be directly specified using the path of the tar and gziped dump file can be directly specified using the
@@ -37,54 +67,50 @@
allow the processing of the rare fasta and fastq files not recognized. allow the processing of the rare fasta and fastq files not recognized.
- In `obiscript`, adds new methods to the Lua sequence object: - In `obiscript`, adds new methods to the Lua sequence object:
- `md5_string()`: returning the MD5 check sum as an hexadecimal string, - `md5_string()`: returning the MD5 check sum as a hexadecimal string,
- `subsequence(from,to)`: allows to extract a subsequence on a 0 based - `subsequence(from,to)`: allows extracting a subsequence on a 0 based
coordinate system, upper bound expluded like in go. coordinate system, upper bound excluded like in go.
- `reverse_complement`: returning a sequence object corresponding to the reverse complement - `reverse_complement`: returning a sequence object corresponding to the
of the current sequence. reverse complement of the current sequence.
### Change of git repositiory ### Enhancement
- The OBITools4 git repository has been moved to the github repository. - In every *OBITools* command, the progress bar is automatically deactivated
when the standard error output is redirected.
- As Genbank and ENA:EMBL contain very large sequences, while OBITools4
is optimized for short sequences, `obipcr` faces some problems
with excessive consumption of computer resources, especially memory. Several
improvements in the tuning of the default `obipcr` parameters and some new
features, currently only available for FASTA and FASTQ file readers, have
been implemented to limit the memory impact of `obipcr` without changing the
computational efficiency too much.
- Logging system and therefore format, have been homogenized.
### Change of git repository
- The OBITools4 git repository has been moved to the GitHub repository.
The new address is: https://github.com/metabarcoding/obitools4. The new address is: https://github.com/metabarcoding/obitools4.
Take care for using the new install script for retrieving the new version. Take care for using the new install script for retrieving the new version.
```bash ```bash
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh \ curl -L https://metabarcoding.org/obitools4/install.sh \
| bash | bash
``` ```
or with options: or with options:
```bash ```bash
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh \ curl -L https://metabarcoding.org/obitools4/install.sh \
| bash -s -- --install-dir test_install --obitools-prefix k | bash -s -- --install-dir test_install --obitools-prefix k
``` ```
### CPU limitation
- By default, *OBITools4* tries to use all the computing power available on
your computer. In some circumstances this can be problematic (e.g. if you
are running on a computer cluster managed by your university). You can limit
the number of CPU cores used by *OBITools4* or by using the **--max-cpu**
option or by setting the **OBIMAXCPU** environment variable. Some strange
behaviour of *OBITools4* has been observed when users try to limit the
maximum number of usable CPU cores to one. This seems to be caused by the Go
language, and it is not obvious to get *OBITools4* to run correctly on a
single core in all circumstances. Therefore, if you ask to use a single
core, **OBITools4** will print a warning message and actually set this
parameter to two cores. If you really want a single core, you can use the
**--force-one-core** option. But be aware that this can lead to incorrect
calculations.
### New features
- The output of the obitools will evolve to produce results only in standard - The output of the obitools will evolve to produce results only in standard
formats such as fasta and fastq. For non-sequential data, the output will be formats such as fasta and fastq. For non-sequential data, the output will be
in CSV format, with the separator `,`, the decimal separator `.`, and a in CSV format, with the separator `,`, the decimal separator `.`, and a
header line with the column names. It is more convenient to use the output header line with the column names. It is more convenient to use the output
in other programs. For example, you can use the `csvtomd` command to in other programs. For example, you can use the `csvtomd` command to
reformat the csv output into a markdown table. The first command to initiate reformat the CSV output into a Markdown table. The first command to initiate
this change is `obicount`, which now produces a 3-line CSV output. this change is `obicount`, which now produces a 3-line CSV output.
```bash ```bash
@@ -96,7 +122,7 @@
database for `obitag` is to use `obipcr` on a local copy of Genbank or EMBL. database for `obitag` is to use `obipcr` on a local copy of Genbank or EMBL.
However, these sequence databases are known to contain many taxonomic However, these sequence databases are known to contain many taxonomic
errors, such as bacterial sequences annotated with the taxid of their host errors, such as bacterial sequences annotated with the taxid of their host
species. obicleandb tries to detect these errors. To do this, it first keeps species. `obicleandb` tries to detect these errors. To do this, it first keeps
only sequences annotated with the taxid to which a species, genus, and only sequences annotated with the taxid to which a species, genus, and
family taxid can be assigned. Then, for each sequence, it compares the family taxid can be assigned. Then, for each sequence, it compares the
distance of the sequence to the other sequences belonging to the same genus distance of the sequence to the other sequences belonging to the same genus
@@ -107,7 +133,7 @@
with the p-value of the Mann-Whitney U test in the **obicleandb_trusted** with the p-value of the Mann-Whitney U test in the **obicleandb_trusted**
slot. Later, the distribution of this p-value can be analyzed to determine a slot. Later, the distribution of this p-value can be analyzed to determine a
threshold. Empirically, a threshold of 0.05 is a good compromise and allows threshold. Empirically, a threshold of 0.05 is a good compromise and allows
to filter out less than 1‰ of the sequences. These sequences can then be filtering out less than 1‰ of the sequences. These sequences can then be
removed using `obigrep`. removed using `obigrep`.
- Adds a new `obijoin` utility to join information contained in a sequence - Adds a new `obijoin` utility to join information contained in a sequence
@@ -117,16 +143,16 @@
- Adds a new tool `obidemerge` to demerge a `merge_xxx` slot by recreating the - Adds a new tool `obidemerge` to demerge a `merge_xxx` slot by recreating the
multiple identical sequences having the slot `xxx` recreated with its initial multiple identical sequences having the slot `xxx` recreated with its initial
value and the sequence count set to the number of occurences refered in the value and the sequence count set to the number of occurrences referred in the
`merge_xxx` slot. During the operation, the `merge_xxx` slot is removed. `merge_xxx` slot. During the operation, the `merge_xxx` slot is removed.
- Adds CSV as one of the input format for every obitools command. To encode - Adds CSV as one of the input format for every obitools command. To encode
sequence the CSV file must includes a column named `sequence` and another sequence the CSV file must include a column named `sequence` and another
column named `id`. An extra column named `qualities` can be added to specify column named `id`. An extra column named `qualities` can be added to specify
the quality scores of the sequence following the same ascii encoding than the the quality scores of the sequence following the same ASCII encoding than the
fastq format. All the other columns will be considered as annotations and will fastq format. All the other columns will be considered as annotations and will
be interpreted as JSON objects encoding potentially for atomic values. If a be interpreted as JSON objects encoding potentially for atomic values. If a
calumn value can not be decoded as JSON it will be considered as a string. column value can not be decoded as JSON it will be considered as a string.
- A new option **--version** has been added to every obitools command. It will - A new option **--version** has been added to every obitools command. It will
print the version of the command. print the version of the command.
@@ -135,8 +161,8 @@
quality scores from a BioSequence object.\ quality scores from a BioSequence object.\
- In `obimultiplex` the ngsfilter file describing the samples can now be provided - In `obimultiplex` the ngsfilter file describing the samples can now be provided
not only using the classical nfsfilter format but also using the csv format. not only using the classical ngsfilter format but also using the CSV format.
When using csv, the first line must contain the column names. 5 columns are When using CSV, the first line must contain the column names. 5 columns are
expected: expected:
- `experiment` the name of the experiment - `experiment` the name of the experiment
@@ -152,43 +178,34 @@
Supplementary columns are allowed. Their names and content will be used to Supplementary columns are allowed. Their names and content will be used to
annotate the sequence corresponding to the sample, as the `key=value;` did annotate the sequence corresponding to the sample, as the `key=value;` did
in the nfsfilter format. in the ngsfilter format.
The CSV format used allows for comment lines starting with `#` character. The CSV format used allows for comment lines starting with `#` character.
Special data lines starting with `@param` in the first column allow to Special data lines starting with `@param` in the first column allow configuring the algorithm. The options **--template** provided an over
configure the algorithm. The options **--template** provided an over commented example of the CSV format, including all the possible options.
commented example of the csv format, including all the possible options.
### CPU limitation
### Enhancement - By default, *OBITools4* tries to use all the computing power available on
your computer. In some circumstances this can be problematic (e.g. if you
are running on a computer cluster managed by your university). You can limit
the number of CPU cores used by *OBITools4* or by using the **--max-cpu**
option or by setting the **OBIMAXCPU** environment variable. Some strange
behavior of *OBITools4* has been observed when users try to limit the
maximum number of usable CPU cores to one. This seems to be caused by the Go
language, and it is not obvious to get *OBITools4* to run correctly on a
single core in all circumstances. Therefore, if you ask to use a single
core, **OBITools4** will print a warning message and actually set this
parameter to two cores. If you really want a single core, you can use the
**--force-one-core** option. But be aware that this can lead to incorrect
calculations.
- In every *OBITools* command, the progress bar are automatically deactivated
when the standard error output is redirected.
- As Genbank and ENA:EMBL contain very large sequences, while
OBITools4 is optimised for short sequences, `obipcr` faces some problems
with excessive consumption of computer resources, especially memory. Several
improvements in the tuning of the default `obipcr` parameters and some new
features, currently only available for FASTA and FASTQ file readers, have
been implemented to limit the memory impact of `obipcr` without changing the
computational efficiency too much.
- Logging system and therefore format, have been homogenized.
### Bug
- In `obitag`, correct the wrong assignment of the **obitag_bestmatch**
attribute.
- In `obiclean`, the **--no-progress-bar** option disables all progress bars,
not just the data.
- Several fixes in reading FASTA and FASTQ files, including some code
simplification and and factorization.
- Fixed a bug in all obitools that caused the same file to be processed
multiple times. when specifying a directory name as input.
## April 2nd, 2024. Release 4.2.0 ## April 2nd, 2024. Release 4.2.0
### New features ### New features
- A new OBITools named `obiscript` allows to process each sequence according - A new OBITools named `obiscript` allows processing each sequence according
to a Lua script. This is an experimental tool. The **--template** option to a Lua script. This is an experimental tool. The **--template** option
allows for generating an example script on the `stdout`. allows for generating an example script on the `stdout`.
@@ -196,7 +213,7 @@
- Two of the main class `obiseq.SeqWorker` and `obiseq.SeqWorker` have their - Two of the main class `obiseq.SeqWorker` and `obiseq.SeqWorker` have their
declaration changed. Both now return two values a `obiseq.BioSequenceSlice` declaration changed. Both now return two values a `obiseq.BioSequenceSlice`
and an `error`. This allow a worker to return potentially several sequences and an `error`. This allows a worker to return potentially several sequences
as the result of the processing of a single sequence, or zero, which is as the result of the processing of a single sequence, or zero, which is
equivalent to filter out the input sequence. equivalent to filter out the input sequence.
@@ -204,12 +221,12 @@
- In `obitag` if the reference database contains sequences annotated by taxid - In `obitag` if the reference database contains sequences annotated by taxid
not referenced in the taxonomy, the corresponding sequences are discarded not referenced in the taxonomy, the corresponding sequences are discarded
from the reference database and a warning indicating the sequence id and the from the reference database and a warning indicating the sequence *id* and the
wrong taxid is emitted. wrong taxid is emitted.
- The bug corrected in the parsing of EMBL and Genbank files as implemented in - The bug corrected in the parsing of EMBL and Genbank files as implemented in
version 4.1.2 of OBITools4, potentially induced some reduction in the version 4.1.2 of OBITools4, potentially induced some reduction in the
performance of the parsing. This should have been now fixed. performance of the parsing. This should have been now fixed.
- In the same idea, parsing of genbank and EMBL files were reading and storing - In the same idea, parsing of Genbank and EMBL files were reading and storing
in memory not only the sequence but also the annotations (features table). in memory not only the sequence but also the annotations (features table).
Up to now none of the OBITools are using this information, but with large Up to now none of the OBITools are using this information, but with large
complete genomes, it is occupying a lot of memory. To reduce this impact, complete genomes, it is occupying a lot of memory. To reduce this impact,
@@ -248,7 +265,7 @@
### New feature ### New feature
- In `obimatrix` a **--transpose** option allows to transpose the produced - In `obimatrix` a **--transpose** option allows transposing the produced
matrix table in CSV format. matrix table in CSV format.
- In `obitpairing` and `obipcrtag` two new options **--exact-mode** and - In `obitpairing` and `obipcrtag` two new options **--exact-mode** and
**--fast-absolute** to control the heuristic used in the alignment **--fast-absolute** to control the heuristic used in the alignment
@@ -256,7 +273,7 @@
the exact algorithm at the cost of a speed. **--fast-absolute** change the the exact algorithm at the cost of a speed. **--fast-absolute** change the
scoring schema of the heuristic. scoring schema of the heuristic.
- In `obiannotate` adds the possibility to annotate the first match of a - In `obiannotate` adds the possibility to annotate the first match of a
pattern using the same algorithm than the one used in `obipcr` and pattern using the same algorithm as the one used in `obipcr` and
`obimultiplex`. For that four options were added: `obimultiplex`. For that four options were added:
- **--pattern** : to specify the pattern. It can use IUPAC codes and - **--pattern** : to specify the pattern. It can use IUPAC codes and
position with no error tolerated has to be followed by a `#` character. position with no error tolerated has to be followed by a `#` character.
@@ -337,7 +354,7 @@
### Bugs ### Bugs
- in the obitools language, the `composition` function now returns a map - In the obitools language, the `composition` function now returns a map
indexed by lowercase string "a", "c", "g", "t" and "o" for other instead of indexed by lowercase string "a", "c", "g", "t" and "o" for other instead of
being indexed by the ASCII codes of the corresponding letters. being indexed by the ASCII codes of the corresponding letters.
- Correction of the reverse-complement operation. Every reverse complement of - Correction of the reverse-complement operation. Every reverse complement of
@@ -350,18 +367,18 @@
duplicating the quality values. This made `obimultiplex` to produce fastq duplicating the quality values. This made `obimultiplex` to produce fastq
files with sequences having quality values duplicated. files with sequences having quality values duplicated.
### Becareful ### Be careful
GO 1.21.0 is out, and it includes new functionalities which are used in the GO 1.21.0 is out, and it includes new functionalities which are used in the
OBITools4 code. If you use the recommanded method for compiling OBITools on your OBITools4 code. If you use the recommended method for compiling OBITools on your
computer, their is no problem, as the script always load the latest GO version. computer, there is no problem, as the script always load the latest GO version.
If you rely on you personnal GO install, please think to update. If you rely on your personal GO install, please think to update.
## August 29th, 2023. Release 4.0.5 ## August 29th, 2023. Release 4.0.5
### Bugs ### Bugs
- Patch a bug in the `obiseq.BioSequence` constructor leading to a error on - Patch a bug in the `obiseq.BioSequence` constructor leading to an error on
almost every obitools. The error message indicates : `fatal error: sync: almost every obitools. The error message indicates : `fatal error: sync:
unlock of unlocked mutex` This bug was introduced in the release 4.0.4 unlock of unlocked mutex` This bug was introduced in the release 4.0.4
@@ -380,7 +397,7 @@ If you rely on you personnal GO install, please think to update.
data structure to limit the number of alignments actually computed. This data structure to limit the number of alignments actually computed. This
increase a bit the speed of both the software. `obirefidx` is nevertheless increase a bit the speed of both the software. `obirefidx` is nevertheless
still too slow compared to my expectation. still too slow compared to my expectation.
- Switch to a parallel version of the gzip library, allowing for high speed - Switch to a parallel version of the GZIP library, allowing for high speed
compress and decompress operation on files. compress and decompress operation on files.
### New feature ### New feature
@@ -424,12 +441,12 @@ If you rely on you personnal GO install, please think to update.
--unidentified not_assigned.fastq --unidentified not_assigned.fastq
``` ```
the command produced four files : `tagged_library_R1.fastq` and The command produced four files : `tagged_library_R1.fastq` and
`tagged_library_R2.fastq` containing the assigned reads and `tagged_library_R2.fastq` containing the assigned reads and
`not_assigned_R1.fastq` and `not_assigned_R2.fastq` containing the `not_assigned_R1.fastq` and `not_assigned_R2.fastq` containing the
unassignable reads. unassignable reads.
the tagged library files can then be split using `obidistribute`: The tagged library files can then be split using `obidistribute`:
```{bash} ```{bash}
mkdir pcr_reads mkdir pcr_reads
@@ -439,9 +456,9 @@ If you rely on you personnal GO install, please think to update.
- Adding of two options **--add-lca-in** and **--lca-error** to `obiannotate`. - Adding of two options **--add-lca-in** and **--lca-error** to `obiannotate`.
These options aim to help during construction of reference database using These options aim to help during construction of reference database using
`obipcr`. On obipcr output, it is commonly run obiuniq. To merge identical `obipcr`. On `obipcr` output, it is commonly run `obiuniq`. To merge identical
sequences annotated with different taxids, it is now possible to use the sequences annotated with different taxids, it is now possible to use the
following strategie : following strategy :
```{bash} ```{bash}
obiuniq -m taxid myrefdb.obipcr.fasta \ obiuniq -m taxid myrefdb.obipcr.fasta \
@@ -472,7 +489,7 @@ If you rely on you personnal GO install, please think to update.
- Correction of a bug in `obiconsensus` leading into the deletion of a base - Correction of a bug in `obiconsensus` leading into the deletion of a base
close to the beginning of the consensus sequence. close to the beginning of the consensus sequence.
## March 31th, 2023. Release 4.0.2 ## March 31st, 2023. Release 4.0.2
### Compiler change ### Compiler change
@@ -483,15 +500,15 @@ If you rely on you personnal GO install, please think to update.
- Add the possibility for looking pattern with indels. This has been added to - Add the possibility for looking pattern with indels. This has been added to
`obimultiplex` through the **--with-indels** option. `obimultiplex` through the **--with-indels** option.
- Every obitools command has a **--pprof** option making the command - Every obitools command has a **--pprof** option making the command
publishing a profiling web site available at the address : publishing a profiling website available at the address :
<http://localhost:8080/debug/pprof/> <http://localhost:8080/debug/pprof/>
- A new `obiconsensus` command has been added. It is a prototype. It aims to - A new `obiconsensus` command has been added. It is a prototype. It aims to
build a consensus sequence from a set of reads. The consensus is estimated build a consensus sequence from a set of reads. The consensus is estimated
for all the sequences contained in the input file. If several input files, for all the sequences contained in the input file. If several input files,
or a directory name are provided the result contains a consensus per file. or a directory name are provided the result contains a consensus per file.
The id of the sequence is the name of the input file depleted of its The *id* of the sequence is the name of the input file depleted of its
directory name and of all its extensions. directory name and of all its extensions.
- In `obipcr` an experimental option **--fragmented** allows for spliting very - In `obipcr` an experimental option **--fragmented** allows for splitting very
long query sequences into shorter fragments with an overlap between the two long query sequences into shorter fragments with an overlap between the two
contiguous fragment insuring that no amplicons are missed despite the split. contiguous fragments ensuring that no amplicons are missed despite the split.
As a site effect some amplicon can be identified twice. As a side effect some amplicons can be identified twice.
@@ -534,7 +551,7 @@ If you rely on you personnal GO install, please think to update.
### Enhancement ### Enhancement
- *OBITools* are automatically processing all the sequences files contained in - *OBITools* are automatically processing all the sequences files contained in
a directory and its sub-directory\ a directory and its subdirectory\
recursively if its name is provided as input. To process easily Genbank recursively if its name is provided as input. To process easily Genbank
files, the corresponding filename extensions have been added. Today the files, the corresponding filename extensions have been added. Today the
following extensions are recognized as sequence files : `.fasta`, `.fastq`, following extensions are recognized as sequence files : `.fasta`, `.fastq`,
@@ -551,7 +568,7 @@ If you rely on you personnal GO install, please think to update.
export OBICPUMAX=4 export OBICPUMAX=4
``` ```
- Adds a new option --out\|-o allowing to specify the name of an outpout file. - Adds a new option --out\|-o allowing to specify the name of an output file.
``` bash ``` bash
obiconvert -o xyz.fasta xxx.fastq obiconvert -o xyz.fasta xxx.fastq
@@ -573,10 +590,10 @@ If you rely on you personnal GO install, please think to update.
matched files remain consistent when processed. matched files remain consistent when processed.
- Adding of the function `ifelse` to the expression language for computing - Adding of the function `ifelse` to the expression language for computing
conditionnal values. conditional values.
- Adding two function to the expression language related to sequence - Adding two function to the expression language related to sequence
conposition : `composition` and `gcskew`. Both are taking a sequence as composition : `composition` and `gcskew`. Both are taking a sequence as
single argument. single argument.
## February 18th, 2023. Release 4.0.0 ## February 18th, 2023. Release 4.0.0
@@ -584,8 +601,8 @@ If you rely on you personnal GO install, please think to update.
It is the first version of the *OBITools* version 4. I decided to tag then It is the first version of the *OBITools* version 4. I decided to tag then
following two weeks of intensive data analysis with them allowing to discover following two weeks of intensive data analysis with them allowing to discover
many small bugs present in the previous non-official version. Obviously other many small bugs present in the previous non-official version. Obviously other
bugs are certainly persent in the code, and you are welcome to use the git bugs are certainly present in the code, and you are welcome to use the git
ticket system to mention them. But they seems to produce now reliable results. ticket system to mention them. But they seem to produce now reliable results.
### Corrected bugs ### Corrected bugs
@@ -593,11 +610,11 @@ ticket system to mention them. But they seems to produce now reliable results.
of sequences and to the production of incorrect file because of the last of sequences and to the production of incorrect file because of the last
sequence record, sometime truncated in its middle. This was only occurring sequence record, sometime truncated in its middle. This was only occurring
when more than a single CPU was used. It was affecting every obitools. when more than a single CPU was used. It was affecting every obitools.
- The `obiparing` software had a bug in the right aligment procedure. This led - The `obiparing` software had a bug in the right alignment procedure. This led
to the non alignment of very sort barcode during the paring of the forward to the non-alignment of very short barcodes during the pairing of the forward
and reverse reads. and reverse reads.
- The `obipairing` tools had a non deterministic comportment when aligning a - The `obipairing` tools had a non-deterministic comportment when aligning a
paor very low quality reads. This induced that the result of the same low pair of very low quality reads. This meant that the result of the same low
quality read pair was not the same from run to run. quality read pair was not the same from run to run.
### New features ### New features
@@ -605,11 +622,10 @@ ticket system to mention them. But they seems to produce now reliable results.
- Adding of a `--compress|-Z` option to every obitools allowing to produce - Adding of a `--compress|-Z` option to every obitools allowing to produce
`gz` compressed output. OBITools were already able to deal with gziped input `gz` compressed output. OBITools were already able to deal with gziped input
files transparently. They can now produce their results in the same format. files transparently. They can now produce their results in the same format.
- Adding of a `--append|-A` option to the `obidistribute` tool. It allows to - Adding of a `--append|-A` option to the `obidistribute` tool. It allows appending the result of an `obidistribute` execution to preexisting files. -
append the result of an `obidistribute` execution to preexisting files. -
Adding of a `--directory|-d` option to the `obidistribute` tool. It allows Adding of a `--directory|-d` option to the `obidistribute` tool. It allows
to declare a secondary classification key over the one defined by the declaring a secondary classification key over the one defined by the
'--category\|-c\` option. This extra key leads to produce directories in `--category\|-c\` option. This extra key leads to produce directories in
which files produced according to the primary criterion are stored. which files produced according to the primary criterion are stored.
- Adding of the functions `subspc`, `printf`, `int`, `numeric`, and `bool` to - Adding of the functions `subspc`, `printf`, `int`, `numeric`, and `bool` to
the expression language. the expression language.

View File

@@ -47,12 +47,27 @@ func main() {
obiconvert.OpenSequenceDataErrorMessage(args, err) obiconvert.OpenSequenceDataErrorMessage(args, err)
taxo := obitax.DefaultTaxonomy() taxo := obitax.DefaultTaxonomy()
references := obitag.CLIRefDB()
if references == nil {
log.Panicln("No loaded reference database")
}
if taxo == nil {
taxo, err = references.ExtractTaxonomy(nil)
if err != nil {
log.Fatalf("No taxonomy specified or extractable from reference database: %v", err)
}
taxo.SetAsDefault()
}
if taxo == nil { if taxo == nil {
log.Panicln("No loaded taxonomy") log.Panicln("No loaded taxonomy")
} }
references := obitag.CLIRefDB()
var identified obiiter.IBioSequence var identified obiiter.IBioSequence
if obitag.CLIGeometricMode() { if obitag.CLIGeometricMode() {

View File

@@ -1,13 +1,16 @@
package main package main
import ( import (
"log"
"os" "os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitaxonomy" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitaxonomy"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
) )
func main() { func main() {
@@ -18,17 +21,49 @@ func main() {
var iterator *obitax.ITaxon var iterator *obitax.ITaxon
switch { switch {
case obitaxonomy.CLIDownloadNCBI():
err := obitaxonomy.CLIDownloadNCBITaxdump()
if err != nil {
log.Errorf("Cannot download NCBI taxonomy: %s", err.Error())
os.Exit(1)
}
os.Exit(0)
case obitaxonomy.CLIExtractTaxonomy():
iter, err := obiconvert.CLIReadBioSequences(args...)
if err != nil {
log.Fatalf("Cannot extract taxonomy: %v", err)
}
taxonomy, err := iter.ExtractTaxonomy()
if err != nil {
log.Fatalf("Cannot extract taxonomy: %v", err)
}
taxonomy.SetAsDefault()
log.Infof("Number of extracted taxa: %d", taxonomy.Len())
iterator = taxonomy.AsTaxonSet().Sort().Iterator()
case obitaxonomy.CLIDumpSubtaxonomy(): case obitaxonomy.CLIDumpSubtaxonomy():
iterator = obitaxonomy.CLISubTaxonomyIterator() iterator = obitaxonomy.CLISubTaxonomyIterator()
case obitaxonomy.CLIRequestsPathForTaxid() != "NA": case obitaxonomy.CLIRequestsPathForTaxid() != "NA":
taxon := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid()) taxon, isAlias, err := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid())
if taxon == nil { if err != nil {
log.Fatalf("Cannot identify the requested taxon: %s", log.Fatalf("Cannot identify the requested taxon: %s (%v)",
obitaxonomy.CLIRequestsPathForTaxid()) obitaxonomy.CLIRequestsPathForTaxid(), err)
}
if isAlias {
if obidefault.FailOnTaxonomy() {
log.Fatalf("Taxon %s is an alias for %s", taxon.String(), taxon.Parent().String())
}
} }
s := taxon.Path() s := taxon.Path()

4
go.mod
View File

@@ -5,7 +5,9 @@ go 1.23.1
require ( require (
github.com/DavidGamba/go-getoptions v0.28.0 github.com/DavidGamba/go-getoptions v0.28.0
github.com/PaesslerAG/gval v1.2.2 github.com/PaesslerAG/gval v1.2.2
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df
github.com/buger/jsonparser v1.1.1
github.com/chen3feng/stl4go v0.1.1 github.com/chen3feng/stl4go v0.1.1
github.com/dlclark/regexp2 v1.11.4 github.com/dlclark/regexp2 v1.11.4
github.com/goccy/go-json v0.10.3 github.com/goccy/go-json v0.10.3
@@ -24,8 +26,6 @@ require (
) )
require ( require (
github.com/Clever/csvlint v0.3.0 // indirect
github.com/buger/jsonparser v1.1.1 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect github.com/davecgh/go-spew v1.1.1 // indirect
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
github.com/kr/pretty v0.3.0 // indirect github.com/kr/pretty v0.3.0 // indirect

5
go.sum
View File

@@ -1,11 +1,11 @@
github.com/Clever/csvlint v0.3.0 h1:58WEFXWy+i0fCbxTXscR2QwYESRuAUFjEGLgZs6j2iU=
github.com/Clever/csvlint v0.3.0/go.mod h1:+wLRuW/bI8NhpRoeyUBxqKsK35OhvgJhXHSWdKp5XJU=
github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c= github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c=
github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84= github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E= github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E=
github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac= github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI= github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8= github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9 h1:Zc1/GNsUpgZR9qm1EmRSKrnOHA7CCd0bIzGdq0cREN0=
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9/go.mod h1:PZyV4WA3NpqtezSY0h6E6NARAmdDm0qwrydveOyR5Gc=
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0=
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM=
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs= github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
@@ -69,7 +69,6 @@ github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=

View File

@@ -0,0 +1,144 @@
#!/bin/bash
#
# Here give the name of the test serie
#
TEST_NAME=obicount
######
#
# Some variable and function definitions: please don't change them
#
######
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
export PATH="${OBITOOLS_DIR}:${PATH}"
TMPDIR="$(mktemp -d)"
ntest=0
success=0
failed=0
# Print the test summary, remove the temporary directory, and exit
# with a non-zero status if any test failed.
cleanup() {
    echo "========================================" 1>&2
    echo "## Results of the $TEST_NAME tests:" 1>&2
    echo 1>&2
    echo "- $ntest tests run" 1>&2
    echo "- $success successfully completed" 1>&2
    echo "- $failed failed tests" 1>&2
    echo 1>&2
    echo "Cleaning up the temporary directory..." 1>&2
    echo 1>&2
    echo "========================================" 1>&2
    rm -rf "$TMPDIR" # Suppress the temporary directory
    if (( failed > 0 )); then
        log "$TEST_NAME tests failed"
        exit 1
    fi
    exit 0
}
# Timestamped logging on stderr.
log() {
    echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
}
# Record the outcome of one test.
#   $1 = exit status of the tested command
#   $2 = message logged on success
#   $3 = message logged on failure
report() {
    ntest=$((ntest + 1))
    if [ "$1" -eq 0 ]; then
        log "$2"
        success=$((success + 1))
    else
        log "$3"
        failed=$((failed + 1))
    fi
}
log "Testing $TEST_NAME..."
log "Test directory is $TEST_DIR"
log "obitools directory is $OBITOOLS_DIR"
log "Temporary directory is $TMPDIR"
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
######################################################################
####
#### Below are the tests
####
#### Each test runs one command, then records its exit status with
#### the report helper, which logs the proper message and updates
#### the ntest / success / failed counters.
####
#### - The datafiles are stored in the same directory than the test script
#### - The test script directory is stored in the TEST_DIR variable
#### - Result files are written to the temporary directory (TMPDIR)
####
######################################################################
# obicount must be able to read the three supported formats.
obicount "${TEST_DIR}/wolf_F.fasta.gz" \
    > "${TMPDIR}/wolf_F.fasta_count.csv"
report $? "OBICount: fasta reading OK" \
          "OBICount: fasta reading failed"
obicount "${TEST_DIR}/wolf_F.fastq.gz" \
    > "${TMPDIR}/wolf_F.fastq_count.csv"
report $? "OBICount: fastq reading OK" \
          "OBICount: fastq reading failed"
obicount "${TEST_DIR}/wolf_F.csv.gz" \
    > "${TMPDIR}/wolf_F.csv_count.csv"
report $? "OBICount: csv reading OK" \
          "OBICount: csv reading failed"
# The counts must not depend on the input format.
diff "${TMPDIR}/wolf_F.fasta_count.csv" \
     "${TMPDIR}/wolf_F.fastq_count.csv" > /dev/null
report $? "OBICount: counting on fasta and fastq are identical OK" \
          "OBICount: counting on fasta and fastq are different failed"
diff "${TMPDIR}/wolf_F.fasta_count.csv" \
     "${TMPDIR}/wolf_F.csv_count.csv" > /dev/null
report $? "OBICount: counting on fasta and csv are identical OK" \
          "OBICount: counting on fasta and csv are different failed"
#########################################
#
# At the end of the tests
# the cleanup function is called
#
#########################################
cleanup

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,134 @@
#!/bin/bash
#
# Here give the name of the test serie
#
# NOTE: fixed typo "obiparing" -> "obipairing" (the tool under test is
# obipairing; all hand-written log messages already said "OBIPairing").
TEST_NAME=obipairing
######
#
# Some variable and function definitions: please don't change them
#
######
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
export PATH="${OBITOOLS_DIR}:${PATH}"
TMPDIR="$(mktemp -d)"
ntest=0
success=0
failed=0
# Print the test summary, remove the temporary directory, and exit
# with a non-zero status if any test failed.
cleanup() {
    echo "========================================" 1>&2
    echo "## Results of the $TEST_NAME tests:" 1>&2
    echo 1>&2
    echo "- $ntest tests run" 1>&2
    echo "- $success successfully completed" 1>&2
    echo "- $failed failed tests" 1>&2
    echo 1>&2
    echo "Cleaning up the temporary directory..." 1>&2
    echo 1>&2
    echo "========================================" 1>&2
    rm -rf "$TMPDIR" # Suppress the temporary directory
    if [ $failed -gt 0 ]; then
        log "$TEST_NAME tests failed"
        exit 1
    fi
    exit 0
}
# Timestamped logging on stderr.
log() {
    echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
}
log "Testing $TEST_NAME..."
log "Test directory is $TEST_DIR"
log "obitools directory is $OBITOOLS_DIR"
log "Temporary directory is $TMPDIR"
# Quote "$TEST_DIR" so the listing survives paths containing spaces.
log "files: $(find "$TEST_DIR" | awk -F'/' '{print $NF}' | tail -n +2)"
######################################################################
####
#### Below are the tests
####
#### Before each test :
#### - increment the variable ntest
####
#### Run the command as the condition of an if / then /else
#### - The command must return 0 on success
#### - The command must return an exit code different from 0 on failure
#### - The datafiles are stored in the same directory than the test script
#### - The test script directory is stored in the TEST_DIR variable
#### - If result files have to be produced they must be stored
####   in the temporary directory (TMPDIR variable)
####
#### then clause is executed on success of the command
#### - Write a success message using the log function
#### - increment the variable success
####
#### else clause is executed on failure of the command
#### - Write a failure message using the log function
#### - increment the variable failed
####
######################################################################
# Pair the forward/reverse reads, then split the result by the "mode"
# attribute (alignment / join); the later tests read the split files.
((ntest++))
if obipairing -F "${TEST_DIR}/wolf_F.fastq.gz" \
    -R "${TEST_DIR}/wolf_R.fastq.gz" \
    | obidistribute -Z -c mode \
    -p "${TMPDIR}/wolf_paired_%s.fastq.gz"
then
    log "OBIPairing: sequence pairing OK"
    ((success++))
else
    log "OBIPairing: sequence pairing failed"
    ((failed++))
fi
# Compare the aligned-read attributes against the reference CSV.
# NOTE(review): the "paring_fast_*" keys are presumably the attribute
# names emitted by obipairing — confirm against the tool before renaming.
((ntest++))
if obicsv -Z -s -i \
    -k ali_dir -k ali_length -k paring_fast_count \
    -k paring_fast_overlap -k paring_fast_score \
    -k score -k score_norm -k seq_a_single \
    -k seq_b_single -k seq_ab_match \
    "${TMPDIR}/wolf_paired_alignment.fastq.gz" \
    > "${TMPDIR}/wolf_paired_alignment.csv.gz" \
    && zdiff -c "${TEST_DIR}/wolf_paired_alignment.csv.gz" \
        "${TMPDIR}/wolf_paired_alignment.csv.gz"
then
    log "OBIPairing: check aligned sequences OK"
    ((success++))
else
    log "OBIPairing: check aligned sequences failed"
    ((failed++))
fi
# Compare the joined (non-overlapping) reads against the reference CSV.
((ntest++))
if obicsv -Z -s -i \
    "${TMPDIR}/wolf_paired_join.fastq.gz" \
    > "${TMPDIR}/wolf_paired_join.csv.gz" \
    && zdiff -c "${TEST_DIR}/wolf_paired_join.csv.gz" \
        "${TMPDIR}/wolf_paired_join.csv.gz"
then
    log "OBIPairing: check joined sequences OK"
    ((success++))
else
    log "OBIPairing: check joined sequences failed"
    ((failed++))
fi
#########################################
#
# At the end of the tests
# the cleanup function is called
#
#########################################
cleanup

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -10,6 +10,7 @@ import (
"strings" "strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
) )
// // A pool of byte slices. // // A pool of byte slices.
@@ -158,12 +159,30 @@ func BuildQualityConsensus(seqA, seqB *obiseq.BioSequence, path []int, statOnMis
match := 0 match := 0
left := obiutils.Abs(path[0])
right := 0
if path[len(path)-1] == 0 {
right = path[len(path)-2]
}
right = obiutils.Abs(right)
right = len(*bufferQA) - right
// log.Warnf("BuildQualityConsensus: left = %d right = %d\n", left, right)
for i, qA = range *bufferQA { for i, qA = range *bufferQA {
nA := (*bufferSA)[i] nA := (*bufferSA)[i]
nB := (*bufferSB)[i] nB := (*bufferSB)[i]
qB = (*bufferQB)[i] qB = (*bufferQB)[i]
if statOnMismatch && nA != nB && nA != ' ' && nB != ' ' { if statOnMismatch && i >= left && i < right && nA != nB {
if nA == ' ' {
nA = '-'
}
if nB == ' ' {
nB = '-'
}
mismatches[strings.ToUpper(fmt.Sprintf("(%c:%02d)->(%c:%02d)", nA, qA, nB, qB))] = i + 1 mismatches[strings.ToUpper(fmt.Sprintf("(%c:%02d)->(%c:%02d)", nA, qA, nB, qB))] = i + 1
} }
@@ -183,13 +202,12 @@ func BuildQualityConsensus(seqA, seqB *obiseq.BioSequence, path []int, statOnMis
q := qA + qB q := qA + qB
if qA > 0 && qB > 0 { if nA != nB {
if nA != nB { q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/40))*10+0.5)
q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/30))*10+0.5) }
}
if nA == nB { if nA == nB {
match++ match++
}
} }
if q > 90 { if q > 90 {

View File

@@ -74,6 +74,30 @@ func _Logaddexp(a, b float64) float64 {
return b + math.Log1p(math.Exp(a-b)) return b + math.Log1p(math.Exp(a-b))
} }
func _Log1mexp(a float64) float64 {
if a > 0 {
log.Panic("Log1mexp: a > 0")
}
if a == 0 {
return 0
}
return (math.Log(-math.Expm1(a)))
}
func _Logdiffexp(a, b float64) float64 {
if a < b {
log.Panic("Log1mexp: a < b")
}
if a == b {
return math.Inf(-1)
}
return a + _Log1mexp(b-a)
}
// _MatchScoreRatio calculates the match score ratio between two bytes. // _MatchScoreRatio calculates the match score ratio between two bytes.
// //
// Parameters: // Parameters:
@@ -83,25 +107,25 @@ func _Logaddexp(a, b float64) float64 {
// Returns: // Returns:
// - float64: the match score ratio when a match is observed // - float64: the match score ratio when a match is observed
// - float64: the match score ratio when a mismatch is observed // - float64: the match score ratio when a mismatch is observed
func _MatchScoreRatio(a, b byte) (float64, float64) { func _MatchScoreRatio(QF, QR byte) (float64, float64) {
l2 := math.Log(2)
l3 := math.Log(3) l3 := math.Log(3)
l4 := math.Log(4)
l10 := math.Log(10) l10 := math.Log(10)
lalea := math.Log(4) // 1 /(change of the random model) qF := -float64(QF) / 10 * l10
lE1 := -float64(a)/10*l10 - l3 // log proba of sequencing error on A/3 qR := -float64(QR) / 10 * l10
lE2 := -float64(b)/10*l10 - l3 // log proba of sequencing error on B/3 term1 := _Logaddexp(qF, qR)
lO1 := math.Log1p(-math.Exp(lE1 + l3)) // log proba no being an error on A term2 := _Logdiffexp(term1, qF+qR)
lO2 := math.Log1p(-math.Exp(lE2 + l3)) // log proba no being an error on B
lO1O2 := lO1 + lO2
lE1E2 := lE1 + lE2
lO1E2 := lO1 + lE2
lO2E1 := lO2 + lE1
MM := _Logaddexp(lO1O2, lE1E2+l3) // Proba match when match observed // log.Warnf("MatchScoreRatio: %v, %v , %v, %v", QF, QR, term1, term2)
Mm := _Logaddexp(_Logaddexp(lO1E2, lO2E1), lE1E2+l2) // Proba match when mismatch observed
return MM + lalea, Mm + lalea match_logp := _Log1mexp(term2 + l3 - l4)
match_score := match_logp - _Log1mexp(match_logp)
mismatch_logp := term2 - l4
mismatch_score := mismatch_logp - _Log1mexp(mismatch_logp)
return match_score, mismatch_score
} }
func _InitNucPartMatch() { func _InitNucPartMatch() {

View File

@@ -21,15 +21,15 @@ func encodeValues(score, length int, out bool) uint64 {
return fo return fo
} }
func _isout(value uint64) bool { // func _isout(value uint64) bool {
const outmask = uint64(1) << dwsize // const outmask = uint64(1) << dwsize
return (value & outmask) == 0 // return (value & outmask) == 0
} // }
func _lpath(value uint64) int { // func _lpath(value uint64) int {
const mask = uint64(1<<wsize) - 1 // const mask = uint64(1<<wsize) - 1
return int(((value + 1) ^ mask) & mask) // return int(((value + 1) ^ mask) & mask)
} // }
func decodeValues(value uint64) (int, int, bool) { func decodeValues(value uint64) (int, int, bool) {
const mask = uint64(1<<wsize) - 1 const mask = uint64(1<<wsize) - 1
@@ -57,4 +57,3 @@ func _setout(value uint64) uint64 {
var _empty = encodeValues(0, 0, false) var _empty = encodeValues(0, 0, false)
var _out = encodeValues(0, 30000, true) var _out = encodeValues(0, 30000, true)
var _notavail = encodeValues(0, 30000, false) var _notavail = encodeValues(0, 30000, false)

View File

@@ -625,6 +625,8 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
&arena.pointer.scoreMatrix, &arena.pointer.scoreMatrix,
&arena.pointer.pathMatrix) &arena.pointer.pathMatrix)
score = scoreR
path = _Backtracking(arena.pointer.pathMatrix, path = _Backtracking(arena.pointer.pathMatrix,
len(rawSeqA), len(rawSeqB), len(rawSeqA), len(rawSeqB),
&(arena.pointer.path)) &(arena.pointer.path))
@@ -641,6 +643,7 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
len(rawSeqA), len(rawSeqB), len(rawSeqA), len(rawSeqB),
&(arena.pointer.path)) &(arena.pointer.path))
isLeftAlign = true isLeftAlign = true
score = scoreL
} }
} }

View File

@@ -2,6 +2,8 @@ package obidefault
var __taxonomy__ = "" var __taxonomy__ = ""
var __alternative_name__ = false var __alternative_name__ = false
var __fail_on_taxonomy__ = false
var __update_taxid__ = false
func SelectedTaxonomy() string { func SelectedTaxonomy() string {
return __taxonomy__ return __taxonomy__
@@ -30,3 +32,27 @@ func SetSelectedTaxonomy(taxonomy string) {
func SetAlternativeNamesSelected(alt bool) { func SetAlternativeNamesSelected(alt bool) {
__alternative_name__ = alt __alternative_name__ = alt
} }
func SetFailOnTaxonomy(fail bool) {
__fail_on_taxonomy__ = fail
}
func SetUpdateTaxid(update bool) {
__update_taxid__ = update
}
func FailOnTaxonomyPtr() *bool {
return &__fail_on_taxonomy__
}
func UpdateTaxidPtr() *bool {
return &__update_taxid__
}
func FailOnTaxonomy() bool {
return __fail_on_taxonomy__
}
func UpdateTaxid() bool {
return __update_taxid__
}

View File

@@ -9,12 +9,11 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/buger/jsonparser" "github.com/buger/jsonparser"
) )
func _parse_json_map_string(str []byte, sequence *obiseq.BioSequence) (map[string]string, error) { func _parse_json_map_string(str []byte) (map[string]string, error) {
values := make(map[string]string) values := make(map[string]string)
jsonparser.ObjectEach(str, jsonparser.ObjectEach(str,
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
@@ -26,7 +25,7 @@ func _parse_json_map_string(str []byte, sequence *obiseq.BioSequence) (map[strin
return values, nil return values, nil
} }
func _parse_json_map_int(str []byte, sequence *obiseq.BioSequence) (map[string]int, error) { func _parse_json_map_int(str []byte) (map[string]int, error) {
values := make(map[string]int) values := make(map[string]int)
jsonparser.ObjectEach(str, jsonparser.ObjectEach(str,
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
@@ -42,7 +41,7 @@ func _parse_json_map_int(str []byte, sequence *obiseq.BioSequence) (map[string]i
return values, nil return values, nil
} }
func _parse_json_map_float(str []byte, sequence *obiseq.BioSequence) (map[string]float64, error) { func _parse_json_map_float(str []byte) (map[string]float64, error) {
values := make(map[string]float64) values := make(map[string]float64)
jsonparser.ObjectEach(str, jsonparser.ObjectEach(str,
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
@@ -58,7 +57,7 @@ func _parse_json_map_float(str []byte, sequence *obiseq.BioSequence) (map[string
return values, nil return values, nil
} }
func _parse_json_map_bool(str []byte, sequence *obiseq.BioSequence) (map[string]bool, error) { func _parse_json_map_bool(str []byte) (map[string]bool, error) {
values := make(map[string]bool) values := make(map[string]bool)
jsonparser.ObjectEach(str, jsonparser.ObjectEach(str,
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
@@ -74,7 +73,7 @@ func _parse_json_map_bool(str []byte, sequence *obiseq.BioSequence) (map[string]
return values, nil return values, nil
} }
func _parse_json_map_interface(str []byte, sequence *obiseq.BioSequence) (map[string]interface{}, error) { func _parse_json_map_interface(str []byte) (map[string]interface{}, error) {
values := make(map[string]interface{}) values := make(map[string]interface{})
jsonparser.ObjectEach(str, jsonparser.ObjectEach(str,
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) { func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
@@ -101,7 +100,7 @@ func _parse_json_map_interface(str []byte, sequence *obiseq.BioSequence) (map[st
return values, nil return values, nil
} }
func _parse_json_array_string(str []byte, sequence *obiseq.BioSequence) ([]string, error) { func _parse_json_array_string(str []byte) ([]string, error) {
values := make([]string, 0) values := make([]string, 0)
jsonparser.ArrayEach(str, jsonparser.ArrayEach(str,
func(value []byte, dataType jsonparser.ValueType, offset int, err error) { func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
@@ -163,7 +162,7 @@ func _parse_json_array_bool(str []byte, sequence *obiseq.BioSequence) ([]bool, e
return values, nil return values, nil
} }
func _parse_json_array_interface(str []byte, sequence *obiseq.BioSequence) ([]interface{}, error) { func _parse_json_array_interface(str []byte) ([]interface{}, error) {
values := make([]interface{}, 0) values := make([]interface{}, 0)
jsonparser.ArrayEach(str, jsonparser.ArrayEach(str,
func(value []byte, dataType jsonparser.ValueType, offset int, err error) { func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
@@ -201,8 +200,6 @@ func _parse_json_array_interface(str []byte, sequence *obiseq.BioSequence) ([]in
} }
func _parse_json_header_(header string, sequence *obiseq.BioSequence) string { func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
taxonomy := obitax.DefaultTaxonomy()
annotations := sequence.Annotations() annotations := sequence.Annotations()
start := -1 start := -1
stop := -1 stop := -1
@@ -264,14 +261,14 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
sequence.SetCount(int(count)) sequence.SetCount(int(count))
case skey == "obiclean_weight": case skey == "obiclean_weight":
weight, err := _parse_json_map_int(value, sequence) weight, err := _parse_json_map_int(value)
if err != nil { if err != nil {
log.Fatalf("%s: Cannot parse obiclean weight %s", sequence.Id(), string(value)) log.Fatalf("%s: Cannot parse obiclean weight %s", sequence.Id(), string(value))
} }
annotations[skey] = weight annotations[skey] = weight
case skey == "obiclean_status": case skey == "obiclean_status":
status, err := _parse_json_map_string(value, sequence) status, err := _parse_json_map_string(value)
if err != nil { if err != nil {
log.Fatalf("%s: Cannot parse obiclean status %s", sequence.Id(), string(value)) log.Fatalf("%s: Cannot parse obiclean status %s", sequence.Id(), string(value))
} }
@@ -279,7 +276,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
case strings.HasPrefix(skey, "merged_"): case strings.HasPrefix(skey, "merged_"):
if dataType == jsonparser.Object { if dataType == jsonparser.Object {
data, err := _parse_json_map_int(value, sequence) data, err := _parse_json_map_int(value)
if err != nil { if err != nil {
log.Fatalf("%s: Cannot parse merged slot %s: %v", sequence.Id(), skey, err) log.Fatalf("%s: Cannot parse merged slot %s: %v", sequence.Id(), skey, err)
} else { } else {
@@ -291,13 +288,8 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
case skey == "taxid": case skey == "taxid":
if dataType == jsonparser.Number || dataType == jsonparser.String { if dataType == jsonparser.Number || dataType == jsonparser.String {
taxid := obiutils.UnsafeString(value) taxid := string(value)
taxon := taxonomy.Taxon(taxid) sequence.SetTaxid(taxid)
if taxon != nil {
sequence.SetTaxon(taxon)
} else {
sequence.SetTaxid(string(value))
}
} else { } else {
log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value)) log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value))
} }
@@ -306,15 +298,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
if dataType == jsonparser.Number || dataType == jsonparser.String { if dataType == jsonparser.Number || dataType == jsonparser.String {
rank, _ := obiutils.SplitInTwo(skey, '_') rank, _ := obiutils.SplitInTwo(skey, '_')
taxid := obiutils.UnsafeString(value) taxid := string(value)
taxon := taxonomy.Taxon(taxid)
if taxon != nil {
taxid = taxon.String()
} else {
taxid = string(value)
}
sequence.SetTaxid(taxid, rank) sequence.SetTaxid(taxid, rank)
} else { } else {
log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value)) log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value))
@@ -332,9 +316,9 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
annotations[skey], err = strconv.ParseFloat(obiutils.UnsafeString(value), 64) annotations[skey], err = strconv.ParseFloat(obiutils.UnsafeString(value), 64)
} }
case jsonparser.Array: case jsonparser.Array:
annotations[skey], err = _parse_json_array_interface(value, sequence) annotations[skey], err = _parse_json_array_interface(value)
case jsonparser.Object: case jsonparser.Object:
annotations[skey], err = _parse_json_map_interface(value, sequence) annotations[skey], err = _parse_json_map_interface(value)
case jsonparser.Boolean: case jsonparser.Boolean:
annotations[skey], err = jsonparser.ParseBoolean(value) annotations[skey], err = jsonparser.ParseBoolean(value)
case jsonparser.Null: case jsonparser.Null:

View File

@@ -72,7 +72,7 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
} }
fastqDetector := func(raw []byte, limit uint32) bool { fastqDetector := func(raw []byte, limit uint32) bool {
ok, err := regexp.Match("^@[^ ].*\n[^ ]+\n\\+", raw) ok, err := regexp.Match("^@[^ ].*\n[A-Za-z.-]+", raw)
return ok && err == nil return ok && err == nil
} }

View File

@@ -0,0 +1,18 @@
package obiiter
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
func (iterator *IBioSequence) ExtractTaxonomy() (taxonomy *obitax.Taxonomy, err error) {
for iterator.Next() {
slice := iterator.Get().Slice()
taxonomy, err = slice.ExtractTaxonomy(taxonomy)
if err != nil {
return
}
}
return
}

View File

@@ -19,7 +19,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
newiter.WaitAndClose() newiter.WaitAndClose()
}() }()
f := func(iterator IBioSequence, id int) { f := func(iterator IBioSequence) {
source := "" source := ""
for iterator.Next() { for iterator.Next() {
news := obiseq.MakeBioSequenceSlice() news := obiseq.MakeBioSequenceSlice()
@@ -66,9 +66,9 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
} }
for i := 1; i < nworkers; i++ { for i := 1; i < nworkers; i++ {
go f(iterator.Split(), i) go f(iterator.Split())
} }
go f(iterator, 0) go f(iterator)
return newiter.SortBatches().Rebatch(size) return newiter.SortBatches().Rebatch(size)
} }

View File

@@ -2,6 +2,7 @@ package obikmer
import ( import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
) )
var __single_base_code__ = []byte{0, var __single_base_code__ = []byte{0,
@@ -131,33 +132,39 @@ func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obise
maxshift := 0 maxshift := 0
maxcount := 0 maxcount := 0
maxscore := -1.0 maxscore := -1.0
maxrelscore := -1.0
for shift, count := range *shifts { for shift, count := range *shifts {
delete((*shifts), shift) delete((*shifts), shift)
score := float64(count) selectscore := float64(count)
if relscore { relativescore := float64(count)
over := -shift over := -shift
switch { switch {
case shift > 0: case shift > 0:
over += lindex over += lindex
case shift < 0: case shift < 0:
over = seq.Len() - over over = seq.Len() - over
default: default:
over = min(lindex, seq.Len()) over = min(lindex, seq.Len())
}
score = score / float64(over-3)
} }
if score > maxscore { relativescore = relativescore / float64(over-3)
if relscore {
selectscore = relativescore
}
if selectscore > maxscore {
maxshift = shift maxshift = shift
maxcount = count maxcount = count
maxscore = score maxscore = selectscore
maxrelscore = relativescore
} else { } else {
if score == maxscore && shift < maxshift { if selectscore == maxscore && obiutils.Abs(shift) < obiutils.Abs(maxshift) {
maxshift = shift maxshift = shift
maxcount = count maxcount = count
maxrelscore = relativescore
} }
} }
} }
return maxshift, maxcount, maxscore return maxshift, maxcount, maxrelscore
} }

View File

@@ -4,4 +4,5 @@ import lua "github.com/yuin/gopher-lua"
func RegisterObilib(luaState *lua.LState) { func RegisterObilib(luaState *lua.LState) {
RegisterObiSeq(luaState) RegisterObiSeq(luaState)
RegisterObiTaxonomy(luaState)
} }

View File

@@ -1,7 +1,9 @@
package obilua package obilua
import ( import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
lua "github.com/yuin/gopher-lua" lua "github.com/yuin/gopher-lua"
) )
@@ -16,6 +18,7 @@ func registerBioSequenceType(luaState *lua.LState) {
bioSequenceType := luaState.NewTypeMetatable(luaBioSequenceTypeName) bioSequenceType := luaState.NewTypeMetatable(luaBioSequenceTypeName)
luaState.SetGlobal(luaBioSequenceTypeName, bioSequenceType) luaState.SetGlobal(luaBioSequenceTypeName, bioSequenceType)
luaState.SetField(bioSequenceType, "new", luaState.NewFunction(newObiSeq)) luaState.SetField(bioSequenceType, "new", luaState.NewFunction(newObiSeq))
luaState.SetField(bioSequenceType, "nil", obiseq2Lua(luaState, nil))
luaState.SetField(bioSequenceType, "__index", luaState.SetField(bioSequenceType, "__index",
luaState.SetFuncs(luaState.NewTable(), luaState.SetFuncs(luaState.NewTable(),
@@ -53,6 +56,7 @@ var bioSequenceMethods = map[string]lua.LGFunction{
"definition": bioSequenceGetSetDefinition, "definition": bioSequenceGetSetDefinition,
"count": bioSequenceGetSetCount, "count": bioSequenceGetSetCount,
"taxid": bioSequenceGetSetTaxid, "taxid": bioSequenceGetSetTaxid,
"taxon": bioSequenceGetSetTaxon,
"attribute": bioSequenceGetSetAttribute, "attribute": bioSequenceGetSetAttribute,
"len": bioSequenceGetLength, "len": bioSequenceGetLength,
"has_sequence": bioSequenceHasSequence, "has_sequence": bioSequenceHasSequence,
@@ -62,6 +66,9 @@ var bioSequenceMethods = map[string]lua.LGFunction{
"md5_string": bioSequenceGetMD5String, "md5_string": bioSequenceGetMD5String,
"subsequence": bioSequenceGetSubsequence, "subsequence": bioSequenceGetSubsequence,
"reverse_complement": bioSequenceGetRevcomp, "reverse_complement": bioSequenceGetRevcomp,
"fasta": bioSequenceGetFasta,
"fastq": bioSequenceGetFastq,
"string": bioSequenceAsString,
} }
// checkBioSequence checks if the first argument in the Lua stack is a *obiseq.BioSequence. // checkBioSequence checks if the first argument in the Lua stack is a *obiseq.BioSequence.
@@ -254,3 +261,88 @@ func bioSequenceGetRevcomp(luaState *lua.LState) int {
luaState.Push(obiseq2Lua(luaState, revcomp)) luaState.Push(obiseq2Lua(luaState, revcomp))
return 1 return 1
} }
func bioSequenceGetSetTaxon(luaState *lua.LState) int {
s := checkBioSequence(luaState)
if luaState.GetTop() > 1 {
taxon := checkTaxon(luaState, 2)
s.SetTaxon(taxon)
return 0
}
taxon := s.Taxon(obitax.DefaultTaxonomy())
luaState.Push(taxon2Lua(luaState, taxon))
return 1
}
func bioSequenceGetFasta(luaState *lua.LState) int {
s := checkBioSequence(luaState)
formater := obiformats.FormatFastSeqJsonHeader
if luaState.GetTop() > 1 {
format := luaState.CheckString(2)
switch format {
case "json":
formater = obiformats.FormatFastSeqJsonHeader
case "obi":
formater = obiformats.FormatFastSeqOBIHeader
}
}
txt := obiformats.FormatFasta(s, formater)
luaState.Push(lua.LString(txt))
return 1
}
func bioSequenceGetFastq(luaState *lua.LState) int {
s := checkBioSequence(luaState)
formater := obiformats.FormatFastSeqJsonHeader
if luaState.GetTop() > 1 {
format := luaState.CheckString(2)
switch format {
case "json":
formater = obiformats.FormatFastSeqJsonHeader
case "obi":
formater = obiformats.FormatFastSeqOBIHeader
}
}
txt := obiformats.FormatFastq(s, formater)
luaState.Push(lua.LString(txt))
return 1
}
func bioSequenceAsString(luaState *lua.LState) int {
s := checkBioSequence(luaState)
formater := obiformats.FormatFastSeqJsonHeader
format := obiformats.FormatFasta
if s.HasQualities() {
format = obiformats.FormatFastq
}
if luaState.GetTop() > 1 {
format := luaState.CheckString(2)
switch format {
case "json":
formater = obiformats.FormatFastSeqJsonHeader
case "obi":
formater = obiformats.FormatFastSeqOBIHeader
}
}
txt := format(s, formater)
luaState.Push(lua.LString(txt))
return 1
}

View File

@@ -1,6 +1,9 @@
package obilua package obilua
import ( import (
"strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
lua "github.com/yuin/gopher-lua" lua "github.com/yuin/gopher-lua"
) )
@@ -11,6 +14,7 @@ func registerBioSequenceSliceType(luaState *lua.LState) {
bioSequenceSliceType := luaState.NewTypeMetatable(luaBioSequenceSliceTypeName) bioSequenceSliceType := luaState.NewTypeMetatable(luaBioSequenceSliceTypeName)
luaState.SetGlobal(luaBioSequenceSliceTypeName, bioSequenceSliceType) luaState.SetGlobal(luaBioSequenceSliceTypeName, bioSequenceSliceType)
luaState.SetField(bioSequenceSliceType, "new", luaState.NewFunction(newObiSeqSlice)) luaState.SetField(bioSequenceSliceType, "new", luaState.NewFunction(newObiSeqSlice))
luaState.SetField(bioSequenceSliceType, "nil", obiseqslice2Lua(luaState, nil))
luaState.SetField(bioSequenceSliceType, "__index", luaState.SetField(bioSequenceSliceType, "__index",
luaState.SetFuncs(luaState.NewTable(), luaState.SetFuncs(luaState.NewTable(),
@@ -37,6 +41,9 @@ var bioSequenceSliceMethods = map[string]lua.LGFunction{
"pop": bioSequenceSlicePop, "pop": bioSequenceSlicePop,
"sequence": bioSequenceSliceGetSetSequence, "sequence": bioSequenceSliceGetSetSequence,
"len": bioSequenceSliceGetLength, "len": bioSequenceSliceGetLength,
"fasta": bioSequenceSliceGetFasta,
"fastq": bioSequenceSliceGetFastq,
"string": bioSequenceSliceAsString,
} }
func checkBioSequenceSlice(L *lua.LState) *obiseq.BioSequenceSlice { func checkBioSequenceSlice(L *lua.LState) *obiseq.BioSequenceSlice {
@@ -105,3 +112,96 @@ func bioSequenceSlicePop(luaState *lua.LState) int {
return 1 return 1
} }
func bioSequenceSliceGetFasta(luaState *lua.LState) int {
s := checkBioSequenceSlice(luaState)
formater := obiformats.FormatFastSeqJsonHeader
if luaState.GetTop() > 1 {
format := luaState.CheckString(2)
switch format {
case "json":
formater = obiformats.FormatFastSeqJsonHeader
case "obi":
formater = obiformats.FormatFastSeqOBIHeader
}
}
txts := make([]string, len(*s))
for i, seq := range *s {
txts[i] = obiformats.FormatFasta(seq, formater)
}
txt := strings.Join(txts, "\n")
luaState.Push(lua.LString(txt))
return 1
}
func bioSequenceSliceGetFastq(luaState *lua.LState) int {
s := checkBioSequenceSlice(luaState)
formater := obiformats.FormatFastSeqJsonHeader
if luaState.GetTop() > 1 {
format := luaState.CheckString(2)
switch format {
case "json":
formater = obiformats.FormatFastSeqJsonHeader
case "obi":
formater = obiformats.FormatFastSeqOBIHeader
}
}
txts := make([]string, len(*s))
for i, seq := range *s {
txts[i] = obiformats.FormatFastq(seq, formater)
}
txt := strings.Join(txts, "\n")
luaState.Push(lua.LString(txt))
return 1
}
func bioSequenceSliceAsString(luaState *lua.LState) int {
s := checkBioSequenceSlice(luaState)
formater := obiformats.FormatFastSeqJsonHeader
if luaState.GetTop() > 1 {
format := luaState.CheckString(2)
switch format {
case "json":
formater = obiformats.FormatFastSeqJsonHeader
case "obi":
formater = obiformats.FormatFastSeqOBIHeader
}
}
txts := make([]string, len(*s))
format := obiformats.FormatFasta
allQual := true
for _, s := range *s {
allQual = allQual && s.HasQualities()
}
if allQual {
format = obiformats.FormatFastq
}
for i, seq := range *s {
txts[i] = format(seq, formater)
}
txt := strings.Join(txts, "\n")
luaState.Push(lua.LString(txt))
return 1
}

139
pkg/obilua/obitaxon.go Normal file
View File

@@ -0,0 +1,139 @@
package obilua
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
lua "github.com/yuin/gopher-lua"
)
const luaTaxonTypeName = "Taxon"
func registerTaxonType(luaState *lua.LState) {
taxonType := luaState.NewTypeMetatable(luaTaxonTypeName)
luaState.SetGlobal(luaTaxonTypeName, taxonType)
luaState.SetField(taxonType, "new", luaState.NewFunction(newTaxon))
luaState.SetField(taxonType, "nil", taxonomy2Lua(luaState, nil))
luaState.SetField(taxonType, "__index",
luaState.SetFuncs(luaState.NewTable(),
taxonMethods))
}
func taxon2Lua(interpreter *lua.LState,
taxon *obitax.Taxon) lua.LValue {
ud := interpreter.NewUserData()
ud.Value = taxon
interpreter.SetMetatable(ud, interpreter.GetTypeMetatable(luaTaxonTypeName))
return ud
}
func newTaxon(luaState *lua.LState) int {
taxonomy := checkTaxonomy(luaState)
taxid := luaState.CheckString(2)
parent := luaState.CheckString(3)
sname := luaState.CheckString(4)
rank := luaState.CheckString(5)
isroot := false
if luaState.GetTop() > 5 {
isroot = luaState.CheckBool(6)
}
taxon, err := taxonomy.AddTaxon(taxid, parent, rank, isroot, false)
if err != nil {
luaState.RaiseError("(%v,%v,%v) : Error on taxon creation: %v", taxid, parent, sname, err)
return 0
}
taxon.SetName(sname, "scientific name")
luaState.Push(taxon2Lua(luaState, taxon))
return 1
}
var taxonMethods = map[string]lua.LGFunction{
"string": taxonAsString,
"scientific_name": taxonGetSetScientificName,
"parent": taxonGetParent,
"taxon_at_rank": taxGetTaxonAtRank,
"species": taxonGetSpecies,
"genus": taxonGetGenus,
"family": taxonGetFamily,
}
func checkTaxon(L *lua.LState, i int) *obitax.Taxon {
ud := L.CheckUserData(i)
if v, ok := ud.Value.(*obitax.Taxon); ok {
return v
}
L.ArgError(i, "obitax.Taxon expected")
return nil
}
func taxonAsString(luaState *lua.LState) int {
taxon := checkTaxon(luaState, 1)
luaState.Push(lua.LString(taxon.String()))
return 1
}
func taxonGetSetScientificName(luaState *lua.LState) int {
taxon := checkTaxon(luaState, 1)
if luaState.GetTop() > 1 {
sname := luaState.CheckString(2)
taxon.SetName(sname, "scientific name")
return 0
}
luaState.Push(lua.LString(taxon.ScientificName()))
return 1
}
func taxonGetParent(luaState *lua.LState) int {
taxon := checkTaxon(luaState, 1)
parent := taxon.Parent()
luaState.Push(taxon2Lua(luaState, parent))
return 1
}
func taxonGetSpecies(luaState *lua.LState) int {
taxon := checkTaxon(luaState, 1)
species := taxon.Species()
luaState.Push(taxon2Lua(luaState, species))
return 1
}
func taxonGetGenus(luaState *lua.LState) int {
taxon := checkTaxon(luaState, 1)
genus := taxon.Genus()
luaState.Push(taxon2Lua(luaState, genus))
return 1
}
func taxonGetFamily(luaState *lua.LState) int {
taxon := checkTaxon(luaState, 1)
family := taxon.Family()
luaState.Push(taxon2Lua(luaState, family))
return 1
}
func taxGetTaxonAtRank(luaState *lua.LState) int {
taxon := checkTaxon(luaState, 1)
rank := luaState.CheckString(2)
taxonAt := taxon.TaxonAtRank(rank)
luaState.Push(taxon2Lua(luaState, taxonAt))
return 1
}

116
pkg/obilua/obitaxonomy.go Normal file
View File

@@ -0,0 +1,116 @@
package obilua
import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
lua "github.com/yuin/gopher-lua"
)
func RegisterObiTaxonomy(luaState *lua.LState) {
registerTaxonomyType(luaState)
registerTaxonType(luaState)
}
const luaTaxonomyTypeName = "Taxonomy"
func registerTaxonomyType(luaState *lua.LState) {
taxonomyType := luaState.NewTypeMetatable(luaTaxonomyTypeName)
luaState.SetGlobal(luaTaxonomyTypeName, taxonomyType)
luaState.SetField(taxonomyType, "new", luaState.NewFunction(newTaxonomy))
luaState.SetField(taxonomyType, "default", luaState.NewFunction(defaultTaxonomy))
luaState.SetField(taxonomyType, "has_default", luaState.NewFunction(hasDefaultTaxonomy))
luaState.SetField(taxonomyType, "nil", taxon2Lua(luaState, nil))
luaState.SetField(taxonomyType, "__index",
luaState.SetFuncs(luaState.NewTable(),
taxonomyMethods))
}
func taxonomy2Lua(interpreter *lua.LState,
taxonomy *obitax.Taxonomy) lua.LValue {
ud := interpreter.NewUserData()
ud.Value = taxonomy
interpreter.SetMetatable(ud, interpreter.GetTypeMetatable(luaTaxonomyTypeName))
return ud
}
func newTaxonomy(luaState *lua.LState) int {
name := luaState.CheckString(1)
code := luaState.CheckString(2)
charset := obiutils.AsciiAlphaNumSet
if luaState.GetTop() > 2 {
charset = obiutils.AsciiSetFromString(luaState.CheckString(3))
}
taxonomy := obitax.NewTaxonomy(name, code, charset)
luaState.Push(taxonomy2Lua(luaState, taxonomy))
return 1
}
func defaultTaxonomy(luaState *lua.LState) int {
taxonomy := obitax.DefaultTaxonomy()
if taxonomy == nil {
luaState.RaiseError("No default taxonomy")
return 0
}
luaState.Push(taxonomy2Lua(luaState, taxonomy))
return 1
}
func hasDefaultTaxonomy(luaState *lua.LState) int {
taxonomy := obitax.DefaultTaxonomy()
luaState.Push(lua.LBool(taxonomy != nil))
return 1
}
var taxonomyMethods = map[string]lua.LGFunction{
"name": taxonomyGetName,
"code": taxonomyGetCode,
"taxon": taxonomyGetTaxon,
}
func checkTaxonomy(L *lua.LState) *obitax.Taxonomy {
ud := L.CheckUserData(1)
if v, ok := ud.Value.(*obitax.Taxonomy); ok {
return v
}
L.ArgError(1, "obitax.Taxonomy expected")
return nil
}
func taxonomyGetName(luaState *lua.LState) int {
taxo := checkTaxonomy(luaState)
luaState.Push(lua.LString(taxo.Name()))
return 1
}
func taxonomyGetCode(luaState *lua.LState) int {
taxo := checkTaxonomy(luaState)
luaState.Push(lua.LString(taxo.Code()))
return 1
}
func taxonomyGetTaxon(luaState *lua.LState) int {
taxo := checkTaxonomy(luaState)
taxid := luaState.CheckString(2)
taxon, isAlias, err := taxo.Taxon(taxid)
if err != nil {
luaState.RaiseError("%s : Error on taxon taxon: %v", taxid, err)
return 0
}
if isAlias && obidefault.FailOnTaxonomy() {
luaState.RaiseError("%s : Taxon is an alias of %s", taxid, taxon.String())
return 0
}
luaState.Push(taxon2Lua(luaState, taxon))
return 1
}

View File

@@ -66,10 +66,6 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
options.GetEnv("OBISOLEXA"), options.GetEnv("OBISOLEXA"),
options.Description("Decodes quality string according to the Solexa specification.")) options.Description("Decodes quality string according to the Solexa specification."))
options.BoolVar(obidefault.CompressedPtr(), "compressed", obidefault.CompressOutput(),
options.Alias("Z"),
options.Description("Compress all the result using gzip"))
for _, o := range optionset { for _, o := range optionset {
o(options) o(options)
} }
@@ -181,6 +177,15 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo
options.Alias("a"), options.Alias("a"),
options.Description("Enable the search on all alternative names and not only scientific names.")) options.Description("Enable the search on all alternative names and not only scientific names."))
} }
options.BoolVar(obidefault.FailOnTaxonomyPtr(), "fail-on-taxonomy",
obidefault.FailOnTaxonomy(),
options.Description("Make obitools failing on error if a used taxid is not a currently valid one"),
)
options.BoolVar(obidefault.UpdateTaxidPtr(), "update-taxid", obidefault.UpdateTaxid(),
options.Description("Make obitools automatically updating the taxid that are declared merged to a newest one."),
)
} }
// CLIIsDebugMode returns whether the CLI is in debug mode. // CLIIsDebugMode returns whether the CLI is in debug mode.

View File

@@ -8,7 +8,7 @@ import (
// corresponds to the last commit, and not the one when the file will be // corresponds to the last commit, and not the one when the file will be
// commited // commited
var _Commit = "c50a0f4" var _Commit = "573acaf"
var _Version = "Release 4.2.0" var _Version = "Release 4.2.0"
// Version returns the version of the obitools package. // Version returns the version of the obitools package.

View File

@@ -1,6 +1,7 @@
package obiseq package obiseq
import ( import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"golang.org/x/exp/slices" "golang.org/x/exp/slices"
@@ -179,3 +180,18 @@ func (s *BioSequenceSlice) SortOnLength(reverse bool) {
} }
}) })
} }
func (s *BioSequenceSlice) ExtractTaxonomy(taxonomy *obitax.Taxonomy) (*obitax.Taxonomy, error) {
var err error
for _, s := range *s {
taxonomy, err = taxonomy.InsertPathString(s.Path())
if err != nil {
return nil, err
}
}
return taxonomy, nil
}

View File

@@ -196,6 +196,16 @@ func IsShorterOrEqualTo(length int) SequencePredicate {
return f return f
} }
func OccurInAtleast(sample string, n int) SequencePredicate {
desc := MakeStatsOnDescription(sample)
f := func(sequence *BioSequence) bool {
stats := sequence.StatsOn(desc, "NA")
return len(stats) >= n
}
return f
}
func IsSequenceMatch(pattern string) SequencePredicate { func IsSequenceMatch(pattern string) SequencePredicate {
pat, err := regexp.Compile("(?i)" + pattern) pat, err := regexp.Compile("(?i)" + pattern)

View File

@@ -31,7 +31,7 @@ func TaxonomyClassifier(taxonomicRank string,
if taxon != nil { if taxon != nil {
ttaxon := taxon.TaxonAtRank(taxonomicRank) ttaxon := taxon.TaxonAtRank(taxonomicRank)
if abortOnMissing && ttaxon == nil { if abortOnMissing && ttaxon == nil {
log.Fatalf("Taxon at rank %s not found in taxonomy for taxid %d", taxonomicRank, taxon.String()) log.Fatalf("Taxon at rank %s not found in taxonomy for taxid %s", taxonomicRank, taxon.String())
} }
} else { } else {
if abortOnMissing { if abortOnMissing {

View File

@@ -4,6 +4,7 @@ import (
"math" "math"
"strings" "strings"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
@@ -15,13 +16,20 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma
taxonomy = taxonomy.OrDefault(true) taxonomy = taxonomy.OrDefault(true)
for taxid, v := range taxids { for taxid, v := range taxids {
t := taxonomy.Taxon(taxid) t, isAlias, err := taxonomy.Taxon(taxid)
if t == nil { if err != nil {
log.Fatalf( log.Fatalf(
"On sequence %s taxid %s is not defined in taxonomy: %s", "On sequence %s taxid %s is not defined in taxonomy: %s (%v)",
sequence.Id(), sequence.Id(),
taxid, taxid,
taxonomy.Name()) taxonomy.Name(),
err,
)
}
if isAlias && obidefault.FailOnTaxonomy() {
log.Fatalf("On sequence %s taxid %s is an alias on %s",
sequence.Id(), taxid, t.String())
} }
taxons[t.Node] = v taxons[t.Node] = v
} }

View File

@@ -5,7 +5,9 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
) )
func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon { func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
@@ -14,7 +16,10 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
if taxid == "NA" { if taxid == "NA" {
return nil return nil
} }
return taxonomy.Taxon(taxid)
taxon, _, _ := taxonomy.Taxon(taxid)
return taxon
} }
// SetTaxid sets the taxid for the BioSequence. // SetTaxid sets the taxid for the BioSequence.
@@ -23,6 +28,9 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
// //
// taxid - the taxid to set. // taxid - the taxid to set.
func (s *BioSequence) SetTaxid(taxid string, rank ...string) { func (s *BioSequence) SetTaxid(taxid string, rank ...string) {
var err error
var isAlias bool
if taxid == "" { if taxid == "" {
taxid = "NA" taxid = "NA"
} else { } else {
@@ -30,11 +38,38 @@ func (s *BioSequence) SetTaxid(taxid string, rank ...string) {
taxon := (*obitax.Taxon)(nil) taxon := (*obitax.Taxon)(nil)
if taxonomy != nil { if taxonomy != nil {
taxon = taxonomy.Taxon(taxid) taxon, isAlias, err = taxonomy.Taxon(taxid)
}
if err != nil {
logger := log.Warnf
if obidefault.FailOnTaxonomy() {
logger = log.Fatalf
}
logger("%s: Taxid: %v is unknown from taxonomy (%v)",
s.Id(), taxid, err)
}
if isAlias {
if obidefault.FailOnTaxonomy() {
log.Fatalf("%s: Taxid: %v is an alias from taxonomy (%v) to %s",
s.Id(), taxid, taxonomy.Name(), taxon.String())
} else {
if obidefault.UpdateTaxid() {
log.Warnf("%s: Taxid: %v is updated to %s",
s.Id(), taxid, taxon.String())
taxid = taxon.String()
} else {
log.Warnf("%s: Taxid %v has to be updated to %s",
s.Id(), taxid, taxon.String())
}
}
} else {
if taxon != nil {
taxid = taxon.String()
}
}
if taxon != nil {
taxid = taxon.String()
} }
} }
@@ -135,14 +170,35 @@ func (sequence *BioSequence) SetFamily(taxonomy *obitax.Taxonomy) *obitax.Taxon
return sequence.SetTaxonAtRank(taxonomy, "family") return sequence.SetTaxonAtRank(taxonomy, "family")
} }
func (sequence *BioSequence) SetPath(taxonomy *obitax.Taxonomy) string { func (sequence *BioSequence) SetPath(taxonomy *obitax.Taxonomy) []string {
taxon := sequence.Taxon(taxonomy) taxon := sequence.Taxon(taxonomy)
path := taxon.Path() path := taxon.Path()
spath := make([]string, path.Len())
lpath := path.Len() - 1
tpath := path.String() for i := lpath; i >= 0; i-- {
sequence.SetAttribute("taxonomic_path", tpath) spath[lpath-i] = path.Get(i).String(taxonomy.Code())
}
return tpath sequence.SetAttribute("taxonomic_path", spath)
return spath
}
func (sequence *BioSequence) Path() []string {
path, ok := sequence.GetAttribute("taxonomic_path")
if !ok {
return nil
}
slice, err := obiutils.InterfaceToStringSlice(path)
if err != nil {
log.Fatalf("%s: taxonomic_path has the wrong type (%v)", sequence.Id(), err)
}
return slice
} }
func (sequence *BioSequence) SetScientificName(taxonomy *obitax.Taxonomy) string { func (sequence *BioSequence) SetScientificName(taxonomy *obitax.Taxonomy) string {

View File

@@ -25,7 +25,7 @@ func IsAValidTaxon(taxonomy *obitax.Taxonomy, withAutoCorrection ...bool) Sequen
if autocorrection { if autocorrection {
sequence.SetTaxid(ttaxid) sequence.SetTaxid(ttaxid)
log.Printf( log.Printf(
"Sequence %s : Taxid %d updated with %d", "Sequence %s : Taxid %s updated with %s",
sequence.Id(), sequence.Id(),
taxid, taxid,
ttaxid, ttaxid,
@@ -63,7 +63,12 @@ func IsSubCladeOfSlot(taxonomy *obitax.Taxonomy, key string) SequencePredicate {
val, ok := sequence.GetStringAttribute(key) val, ok := sequence.GetStringAttribute(key)
if ok { if ok {
parent := taxonomy.Taxon(val) parent, _, err := taxonomy.Taxon(val)
if err != nil {
log.Warnf("%s: %s is unkown from the taxonomy (%v)", sequence.Id(), val, err)
}
taxon := sequence.Taxon(taxonomy) taxon := sequence.Taxon(taxonomy)
return parent != nil && taxon != nil && taxon.IsSubCladeOf(parent) return parent != nil && taxon != nil && taxon.IsSubCladeOf(parent)
} }

View File

@@ -1 +1,38 @@
package obitax package obitax
import (
"strings"
"github.com/TuftsBCB/io/newick"
)
func (taxonomy *Taxonomy) Newick() string {
if taxonomy == nil {
return ""
}
iterator := taxonomy.AsTaxonSet().Sort().Iterator()
nodes := make(map[*string]*newick.Tree, taxonomy.Len())
trees := make([]*newick.Tree, 0)
for iterator.Next() {
taxon := iterator.Get()
tree := &newick.Tree{Label: taxon.String()}
nodes[taxon.Node.id] = tree
if parent, ok := nodes[taxon.Parent().Node.id]; ok {
parent.Children = append(parent.Children, *tree)
} else {
trees = append(trees, tree)
}
}
rep := strings.Builder{}
for _, tree := range trees {
rep.WriteString(tree.String())
rep.WriteString("\n")
}
return rep.String()
}

View File

@@ -1,11 +1,14 @@
package obitax package obitax
import ( import (
"sync"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
var __defaut_taxonomy__ *Taxonomy var __defaut_taxonomy__ *Taxonomy
var __defaut_taxonomy_mutex__ sync.Mutex
func (taxonomy *Taxonomy) SetAsDefault() { func (taxonomy *Taxonomy) SetAsDefault() {
log.Infof("Set as default taxonomy %s", taxonomy.Name()) log.Infof("Set as default taxonomy %s", taxonomy.Name())
@@ -32,14 +35,18 @@ func DefaultTaxonomy() *Taxonomy {
var err error var err error
if __defaut_taxonomy__ == nil { if __defaut_taxonomy__ == nil {
if obidefault.HasSelectedTaxonomy() { if obidefault.HasSelectedTaxonomy() {
__defaut_taxonomy__, err = LoadTaxonomy( __defaut_taxonomy_mutex__.Lock()
obidefault.SelectedTaxonomy(), defer __defaut_taxonomy_mutex__.Unlock()
!obidefault.AreAlternativeNamesSelected(), if __defaut_taxonomy__ == nil {
) __defaut_taxonomy__, err = LoadTaxonomy(
obidefault.SelectedTaxonomy(),
!obidefault.AreAlternativeNamesSelected(),
)
if err != nil { if err != nil {
log.Fatalf("Cannot load default taxonomy: %v", err) log.Fatalf("Cannot load default taxonomy: %v", err)
}
} }
} }
} }

View File

@@ -2,7 +2,6 @@ package obitax
import ( import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
) )
// ITaxon represents an iterator for traversing Taxon instances. // ITaxon represents an iterator for traversing Taxon instances.
@@ -195,7 +194,6 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
pushed := true pushed := true
log.Warn(parents)
for pushed { for pushed {
itaxo := taxo.Iterator() itaxo := taxo.Iterator()
pushed = false pushed = false
@@ -218,9 +216,9 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
} }
func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon { func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon {
taxon := taxonomy.Taxon(taxid) taxon, _, err := taxonomy.Taxon(taxid)
if taxon == nil { if err != nil {
return nil return nil
} }

View File

@@ -91,7 +91,13 @@ func loadNameTable(reader io.Reader, taxonomy *Taxonomy, onlysn bool) int {
if !onlysn || classname == "scientific name" { if !onlysn || classname == "scientific name" {
n++ n++
taxonomy.Taxon(taxid).SetName(name, classname) taxon, _, err := taxonomy.Taxon(taxid)
if err != nil {
log.Fatalf("%s: is unknown from the taxonomy", taxid)
}
taxon.SetName(name, classname)
} }
} }
@@ -125,7 +131,7 @@ func loadMergedTable(reader io.Reader, taxonomy *Taxonomy) int {
oldtaxid := strings.TrimSpace(record[0]) oldtaxid := strings.TrimSpace(record[0])
newtaxid := strings.TrimSpace(record[1]) newtaxid := strings.TrimSpace(record[1])
taxonomy.AddAlias(newtaxid, oldtaxid, false) taxonomy.AddAlias(oldtaxid, newtaxid, false)
} }
return n return n
@@ -196,7 +202,11 @@ func LoadNCBITaxDump(directory string, onlysn bool) (*Taxonomy, error) {
n = loadMergedTable(buffered, taxonomy) n = loadMergedTable(buffered, taxonomy)
log.Printf("%d merged taxa read\n", n) log.Printf("%d merged taxa read\n", n)
root := taxonomy.Taxon("1") root, _, err := taxonomy.Taxon("1")
if err != nil {
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
}
taxonomy.SetRoot(root) taxonomy.SetRoot(root)
return taxonomy, nil return taxonomy, nil

View File

@@ -134,7 +134,12 @@ func LoadNCBITarTaxDump(path string, onlysn bool) (*Taxonomy, error) {
n = loadMergedTable(buffered, taxonomy) n = loadMergedTable(buffered, taxonomy)
log.Printf("%d merged taxa read\n", n) log.Printf("%d merged taxa read\n", n)
root := taxonomy.Taxon("1") root, _, err := taxonomy.Taxon("1")
if err != nil {
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
}
taxonomy.SetRoot(root) taxonomy.SetRoot(root)
return taxonomy, nil return taxonomy, nil

View File

@@ -0,0 +1 @@
package obitax

View File

@@ -0,0 +1,64 @@
package obitax
import (
"errors"
"strings"
)
// ParseTaxonString parses a string in the format "code:taxid [scientific name]@rank"
// and returns the individual components. It handles extra whitespace around components.
//
// Parameters:
// - taxonStr: The string to parse in the format "code:taxid [scientific name]@rank"
//
// Returns:
// - code: The taxonomy code
// - taxid: The taxon identifier
// - scientificName: The scientific name (without brackets)
// - rank: The rank
// - error: An error if the string format is invalid
func ParseTaxonString(taxonStr string) (code, taxid, scientificName, rank string, err error) {
// Trim any leading/trailing whitespace from the entire string
taxonStr = strings.TrimSpace(taxonStr)
// Split by '@' to separate rank
parts := strings.Split(taxonStr, "@")
if len(parts) > 2 {
return "", "", "", "", errors.New("invalid format: multiple '@' characters found")
}
mainPart := strings.TrimSpace(parts[0])
if len(parts) == 2 {
rank = strings.TrimSpace(parts[1])
} else {
rank = "no rank"
}
// Find scientific name part (enclosed in square brackets)
startBracket := strings.Index(mainPart, "[")
endBracket := strings.LastIndex(mainPart, "]")
if startBracket == -1 || endBracket == -1 || startBracket > endBracket {
return "", "", "", "", errors.New("invalid format: scientific name must be enclosed in square brackets")
}
// Extract and clean scientific name
scientificName = strings.TrimSpace(mainPart[startBracket+1 : endBracket])
// Process code:taxid part
idPart := strings.TrimSpace(mainPart[:startBracket])
idComponents := strings.Split(idPart, ":")
if len(idComponents) != 2 {
return "", "", "", "", errors.New("invalid format: missing taxonomy code separator ':'")
}
code = strings.TrimSpace(idComponents[0])
taxid = strings.TrimSpace(idComponents[1])
if code == "" || taxid == "" || scientificName == "" {
return "", "", "", "", errors.New("invalid format: code, taxid and scientific name cannot be empty")
}
return code, taxid, scientificName, rank, nil
}

View File

@@ -1,6 +1,7 @@
package obitax package obitax
import ( import (
"errors"
"iter" "iter"
"regexp" "regexp"
@@ -379,3 +380,29 @@ func (taxon *Taxon) SameAs(other *Taxon) bool {
return taxon.Taxonomy == other.Taxonomy && taxon.Node.id == other.Node.id return taxon.Taxonomy == other.Taxonomy && taxon.Node.id == other.Node.id
} }
func (taxon *Taxon) AddChild(child string, replace bool) (*Taxon, error) {
if taxon == nil {
return nil, errors.New("nil taxon")
}
code, taxid, scientific_name, rank, err := ParseTaxonString(child)
if err != nil {
return nil, err
}
if taxon.Taxonomy.code != code {
return nil, errors.New("taxonomy code mismatch")
}
newTaxon, err := taxon.Taxonomy.AddTaxon(taxid, *taxon.Node.id, rank, false, replace)
if err != nil {
return nil, err
}
newTaxon.SetName(scientific_name, "scientific name")
return newTaxon, nil
}

View File

@@ -12,7 +12,6 @@ import (
"fmt" "fmt"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
) )
// Taxonomy represents a hierarchical classification of taxa. // Taxonomy represents a hierarchical classification of taxa.
@@ -130,27 +129,30 @@ func (taxonomy *Taxonomy) TaxidString(id string) (string, error) {
// Returns: // Returns:
// - A pointer to the Taxon instance associated with the provided taxid. // - A pointer to the Taxon instance associated with the provided taxid.
// - If the taxid is unknown, the method will log a fatal error. // - If the taxid is unknown, the method will log a fatal error.
func (taxonomy *Taxonomy) Taxon(taxid string) *Taxon { func (taxonomy *Taxonomy) Taxon(taxid string) (*Taxon, bool, error) {
taxonomy = taxonomy.OrDefault(false) taxonomy = taxonomy.OrDefault(false)
if taxonomy == nil { if taxonomy == nil {
return nil return nil, false, errors.New("cannot extract taxon from nil taxonomy")
} }
id, err := taxonomy.Id(taxid) id, err := taxonomy.Id(taxid)
if err != nil { if err != nil {
log.Fatalf("Taxid %s: %v", taxid, err) return nil, false, fmt.Errorf("Taxid %s: %v", taxid, err)
} }
taxon := taxonomy.nodes.Get(id) taxon := taxonomy.nodes.Get(id)
isAlias := taxon.Node.id != id
if taxon == nil { if taxon == nil {
log.Fatalf("Taxid %s is not part of the taxonomy %s", return nil,
taxid, false,
taxonomy.name) fmt.Errorf("Taxid %s is not part of the taxonomy %s",
taxid,
taxonomy.name)
} }
return taxon return taxon, isAlias, nil
} }
// AsTaxonSet returns the set of taxon nodes contained within the Taxonomy. // AsTaxonSet returns the set of taxon nodes contained within the Taxonomy.
@@ -353,3 +355,63 @@ func (taxonomy *Taxonomy) HasRoot() bool {
taxonomy = taxonomy.OrDefault(false) taxonomy = taxonomy.OrDefault(false)
return taxonomy != nil && taxonomy.root != nil return taxonomy != nil && taxonomy.root != nil
} }
func (taxonomy *Taxonomy) InsertPathString(path []string) (*Taxonomy, error) {
if len(path) == 0 {
return nil, errors.New("path is empty")
}
code, taxid, scientific_name, rank, err := ParseTaxonString(path[0])
if taxonomy == nil {
taxonomy = NewTaxonomy(code, code, obiutils.AsciiAlphaNumSet)
}
if err != nil {
return nil, err
}
if taxonomy.Len() == 0 {
if code != taxonomy.code {
return nil, fmt.Errorf("cannot insert taxon %s into taxonomy %s with code %s",
path[0], taxonomy.name, taxonomy.code)
}
root, err := taxonomy.AddTaxon(taxid, taxid, rank, true, true)
if err != nil {
return nil, err
}
root.SetName(scientific_name, "scientificName")
}
var current *Taxon
current, _, err = taxonomy.Taxon(taxid)
if err != nil {
return nil, err
}
if !current.IsRoot() {
return nil, errors.New("path does not start with a root node")
}
for _, id := range path[1:] {
taxon, _, err := taxonomy.Taxon(id)
if err == nil {
if !current.SameAs(taxon.Parent()) {
return nil, errors.New("path is not consistent with the taxonomy, parent mismatch")
}
current = taxon
} else {
current, err = current.AddChild(id, false)
if err != nil {
return nil, err
}
}
}
return taxonomy, nil
}

View File

@@ -151,7 +151,8 @@ func (set *TaxonSet) Alias(id *string, taxon *Taxon) {
if original == nil { if original == nil {
log.Fatalf("Original taxon %v is not part of taxon set", id) log.Fatalf("Original taxon %v is not part of taxon set", id)
} }
set.set[id] = taxon.Node
set.set[id] = original.Node
set.nalias++ set.nalias++
} }
@@ -196,3 +197,30 @@ func (set *TaxonSet) Contains(id *string) bool {
node := set.Get(id) node := set.Get(id)
return node != nil return node != nil
} }
func (set *TaxonSet) Sort() *TaxonSlice {
if set == nil {
return nil
}
taxonomy := set.Taxonomy()
taxa := taxonomy.NewTaxonSlice(0, set.Len())
parent := make(map[*TaxNode]bool, set.Len())
pushed := true
for pushed {
pushed = false
for _, node := range set.set {
if !parent[node] && (parent[set.Get(node.parent).Node] ||
!set.Contains(node.parent) ||
node == taxonomy.Root().Node) {
pushed = true
taxa.slice = append(taxa.slice, node)
parent[node] = true
}
}
}
return taxa
}

View File

@@ -0,0 +1,126 @@
package obiclean
import (
"fmt"
"sort"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus"
)
func commonPrefix(a, b *obiseq.BioSequence) int {
i := 0
l := min(a.Len(), b.Len())
if l == 0 {
return 0
}
as := a.Sequence()
bs := b.Sequence()
for i < l && as[i] == bs[i] {
i++
}
if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
}
return i
}
func commonSuffix(a, b *obiseq.BioSequence) int {
i := a.Len() - 1
j := b.Len() - 1
if i < 0 || j < 0 {
return 0
}
as := a.Sequence()
bs := b.Sequence()
l := 0
for i >= 0 && j >= 0 && as[i] == bs[j] {
i--
j--
l++
}
if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
}
// log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
return l
}
func AnnotateChimera(samples map[string]*[]*seqPCR) {
w := func(sample string, seqs *[]*seqPCR) {
ls := len(*seqs)
cp := make([]int, ls)
cs := make([]int, ls)
pcrs := make([]*seqPCR, 0, ls)
for _, s := range *seqs {
if len(s.Edges) == 0 {
pcrs = append(pcrs, s)
}
}
lp := len(pcrs)
sort.Slice(pcrs, func(i, j int) bool {
return pcrs[i].Weight < pcrs[j].Weight
})
for i, s := range pcrs {
for j := i + 1; j < lp; j++ {
s2 := pcrs[j]
cp[j] = commonPrefix(s.Sequence, s2.Sequence)
cs[j] = commonSuffix(s.Sequence, s2.Sequence)
}
var cm map[string]string
var err error
chimera, ok := s.Sequence.GetAttribute("chimera")
if !ok {
cm = map[string]string{}
} else {
cm, err = obiutils.InterfaceToStringMap(chimera)
if err != nil {
log.Fatalf("type of chimera not map[string]string: %T (%v)",
chimera, err)
}
}
ls := s.Sequence.Len()
for k := i + 1; k < lp; k++ {
for l := i + 1; l < lp; l++ {
if k != l && cp[k]+cs[l] == ls {
cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)",
pcrs[k].Sequence.Id(),
pcrs[l].Sequence.Id(),
cp[k])
}
}
}
if len(cm) > 0 {
s.Sequence.SetAttribute("chimera", cm)
}
}
}
for sn, sqs := range samples {
w(sn, sqs)
}
}

View File

@@ -13,23 +13,24 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
"github.com/schollz/progressbar/v3" "github.com/schollz/progressbar/v3"
) )
type Ratio struct { type Ratio struct {
Sample string Sample string
SeqID string SeqID string
status string OriginalStatus string
From int WOriginal int
To int WMutant int
CFrom int COriginal int
CTo int CMutant int
Pos int Pos int
Length int Length int
A int A int
C int C int
G int G int
T int T int
} }
type Edge struct { type Edge struct {
@@ -52,45 +53,21 @@ func makeEdge(father, dist, pos int, from, to byte) Edge {
} }
} }
func abs(x int) int {
if x < 0 {
return -x
}
return x
}
func max(x, y int) int {
if x > y {
return x
}
return y
}
func min(x, y int) int {
if x < y {
return x
}
return y
}
func minMax(x, y int) (int, int) {
if x < y {
return x, y
}
return y, x
}
// It takes a filename and a 2D slice of floats pruduced during graph building, // It takes a filename and a 2D slice of floats pruduced during graph building,
// and writes a CSV file with the first column being the // and writes a CSV file with the first column being the
// first nucleotide, the second column being the second nucleotide, and the third column being the // first nucleotide, the second column being the second nucleotide, and the third column being the
// ratio // ratio
func EmpiricalDistCsv(filename string, data [][]Ratio) { func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
file, err := os.Create(filename) file, err := os.Create(filename)
if err != nil { if err != nil {
fmt.Println(err) fmt.Println(err)
} }
defer file.Close()
destfile, err := obiutils.CompressStream(file, true, true)
if err != nil {
fmt.Println(err)
}
defer destfile.Close()
pbopt := make([]progressbar.Option, 0, 5) pbopt := make([]progressbar.Option, 0, 5)
pbopt = append(pbopt, pbopt = append(pbopt,
@@ -103,19 +80,19 @@ func EmpiricalDistCsv(filename string, data [][]Ratio) {
bar := progressbar.NewOptions(len(data), pbopt...) bar := progressbar.NewOptions(len(data), pbopt...)
fmt.Fprintln(file, "Sample,Father_id,Father_status,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length,A,C,G,T") fmt.Fprintln(destfile, "Sample,Origin_id,Origin_status,Origin,Mutant,Origin_Weight,Mutant_Weight,Origin_Count,Mutant_Count,Position,Origin_length,A,C,G,T")
for code, dist := range data { for code, dist := range data {
a1, a2 := intToNucPair(code) a1, a2 := intToNucPair(code)
for _, ratio := range dist { for _, ratio := range dist {
fmt.Fprintf(file, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n", fmt.Fprintf(destfile, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
ratio.Sample, ratio.Sample,
ratio.SeqID, ratio.SeqID,
ratio.status, ratio.OriginalStatus,
a1, a2, a1, a2,
ratio.From, ratio.WOriginal,
ratio.To, ratio.WMutant,
ratio.CFrom, ratio.COriginal,
ratio.CTo, ratio.CMutant,
ratio.Pos, ratio.Pos,
ratio.Length, ratio.Length,
ratio.A, ratio.A,
@@ -478,16 +455,20 @@ func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
if father.Weight >= minStatRatio && edge.Dist == 1 { if father.Weight >= minStatRatio && edge.Dist == 1 {
s := father.Sequence.Sequence() s := father.Sequence.Sequence()
ratio[edge.NucPair] = append(ratio[edge.NucPair], ratio[edge.NucPair] = append(ratio[edge.NucPair],
Ratio{name, Ratio{
father.Sequence.Id(), Status(father.Sequence)[name], Sample: name,
father.Weight, seq.Weight, SeqID: father.Sequence.Id(),
father.Count, seq.Count, OriginalStatus: Status(father.Sequence)[name],
edge.Pos, WOriginal: father.Weight,
father.Sequence.Len(), WMutant: seq.Weight,
bytes.Count(s, []byte("a")), COriginal: father.Count,
bytes.Count(s, []byte("c")), CMutant: seq.Count,
bytes.Count(s, []byte("g")), Pos: edge.Pos,
bytes.Count(s, []byte("t"))}) Length: father.Sequence.Len(),
A: bytes.Count(s, []byte("a")),
C: bytes.Count(s, []byte("c")),
G: bytes.Count(s, []byte("g")),
T: bytes.Count(s, []byte("t"))})
} }
} }

View File

@@ -2,6 +2,7 @@ package obiclean
import ( import (
"fmt" "fmt"
"maps"
"os" "os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
@@ -19,6 +20,7 @@ type seqPCR struct {
Sequence *obiseq.BioSequence // pointer to the corresponding sequence Sequence *obiseq.BioSequence // pointer to the corresponding sequence
SonCount int SonCount int
AddedSons int AddedSons int
IsHead bool
Edges []Edge Edges []Edge
Cluster map[int]bool // used as the set of head sequences associated to that sequence Cluster map[int]bool // used as the set of head sequences associated to that sequence
} }
@@ -50,6 +52,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
Sequence: s, Sequence: s,
SonCount: 0, SonCount: 0,
AddedSons: 0, AddedSons: 0,
IsHead: false,
}) })
} }
} }
@@ -57,9 +60,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
return samples return samples
} }
func annotateOBIClean(source string, dataset obiseq.BioSequenceSlice, func annotateOBIClean(source string, dataset obiseq.BioSequenceSlice) obiiter.IBioSequence {
sample map[string]*([]*seqPCR),
tag, NAValue string) obiiter.IBioSequence {
batchsize := 1000 batchsize := 1000
var annot = func(data obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) { var annot = func(data obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
@@ -114,6 +115,28 @@ func IsHead(sequence *obiseq.BioSequence) bool {
return ishead return ishead
} }
func NotAlwaysChimera(tag string) obiseq.SequencePredicate {
descriptor := obiseq.MakeStatsOnDescription(tag)
predicat := func(sequence *obiseq.BioSequence) bool {
chimera, ok := sequence.GetStringMap("chimera")
if !ok || len(chimera) == 0 {
return true
}
samples := maps.Keys(sequence.StatsOn(descriptor, "NA"))
for s := range samples {
if _, ok := chimera[s]; !ok {
return true
}
}
return false
}
return predicat
}
func HeadCount(sequence *obiseq.BioSequence) int { func HeadCount(sequence *obiseq.BioSequence) int {
var err error var err error
annotation := sequence.Annotations() annotation := sequence.Annotations()
@@ -237,6 +260,7 @@ func Mutation(sample map[string]*([]*seqPCR)) {
} }
func Status(sequence *obiseq.BioSequence) map[string]string { func Status(sequence *obiseq.BioSequence) map[string]string {
var err error
annotation := sequence.Annotations() annotation := sequence.Annotations()
iobistatus, ok := annotation["obiclean_status"] iobistatus, ok := annotation["obiclean_status"]
var obistatus map[string]string var obistatus map[string]string
@@ -246,9 +270,9 @@ func Status(sequence *obiseq.BioSequence) map[string]string {
case map[string]string: case map[string]string:
obistatus = iobistatus obistatus = iobistatus
case map[string]interface{}: case map[string]interface{}:
obistatus = make(map[string]string) obistatus, err = obiutils.InterfaceToStringMap(obistatus)
for k, v := range iobistatus { if err != nil {
obistatus[k] = fmt.Sprint(v) log.Panicf("obiclean_status attribute of sequence %s must be castable to a map[string]string", sequence.Id())
} }
} }
} else { } else {
@@ -356,19 +380,30 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
} }
} }
if DetectChimera() {
AnnotateChimera(samples)
}
if SaveGraphToFiles() { if SaveGraphToFiles() {
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate()) SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
} }
if IsSaveRatioTable() { if IsSaveRatioTable() {
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate()) all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
EmpiricalDistCsv(RatioTableFilename(), all_ratio) EmpiricalDistCsv(RatioTableFilename(), all_ratio, obidefault.CompressOutput())
} }
iter := annotateOBIClean(source, db, samples, SampleAttribute(), "NA") iter := annotateOBIClean(source, db)
if OnlyHead() { if OnlyHead() {
iter = iter.FilterOn(IsHead, 1000) iter = iter.FilterOn(IsHead,
obidefault.BatchSize()).FilterOn(NotAlwaysChimera(SampleAttribute()),
obidefault.BatchSize())
}
if MinSampleCount() > 1 {
sc := obiseq.OccurInAtleast(SampleAttribute(), MinSampleCount())
iter = iter.FilterOn(sc, obidefault.BatchSize())
} }
return iter return iter

View File

@@ -16,6 +16,8 @@ var _onlyHead = false
var _saveGraph = "__@@NOSAVE@@__" var _saveGraph = "__@@NOSAVE@@__"
var _saveRatio = "__@@NOSAVE@@__" var _saveRatio = "__@@NOSAVE@@__"
var _minSample = 1
var _detectChimera = false
func ObicleanOptionSet(options *getoptions.GetOpt) { func ObicleanOptionSet(options *getoptions.GetOpt) {
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute, options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
@@ -55,6 +57,13 @@ func ObicleanOptionSet(options *getoptions.GetOpt) {
"The ratio file follows the csv format."), "The ratio file follows the csv format."),
) )
options.IntVar(&_minSample, "min-sample-count", _minSample,
options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
)
options.BoolVar(&_detectChimera, "detect-chimera", _detectChimera,
options.Description("Detect chimera sequences."),
)
} }
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
@@ -111,3 +120,13 @@ func IsSaveRatioTable() bool {
func RatioTableFilename() string { func RatioTableFilename() string {
return _saveRatio return _saveRatio
} }
// It returns the minimum number of samples a sequence must be present in to be considered in the analysis
func MinSampleCount() int {
return _minSample
}
// It returns true if chimera detection is enabled
func DetectChimera() bool {
return _detectChimera
}

View File

@@ -3,6 +3,7 @@ package obiconvert
import ( import (
"os" "os"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
@@ -30,7 +31,6 @@ var __output_fastjson_format__ = false
var __output_fastobi_format__ = false var __output_fastobi_format__ = false
var __no_progress_bar__ = false var __no_progress_bar__ = false
var __compressed__ = false
var __skip_empty__ = false var __skip_empty__ = false
var __output_file_name__ = "-" var __output_file_name__ = "-"
@@ -71,16 +71,16 @@ func InputOptionSet(options *getoptions.GetOpt) {
} }
func OutputModeOptionSet(options *getoptions.GetOpt) { func OutputModeOptionSet(options *getoptions.GetOpt, compressed bool) {
options.BoolVar(&__no_progress_bar__, "no-progressbar", false, options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
options.Description("Disable the progress bar printing")) options.Description("Disable the progress bar printing"))
options.BoolVar(&__compressed__, "compress", false, if compressed {
options.Alias("Z"), options.BoolVar(obidefault.CompressedPtr(), "compressed", obidefault.CompressOutput(),
options.Description("Output is compressed")) options.Alias("Z"),
options.Description("Compress all the result using gzip"))
options.BoolVar(&__skip_empty__, "skip-empty", __skip_empty__, }
options.Description("Sequences of length equal to zero are suppressed from the output"))
options.StringVar(&__output_file_name__, "out", __output_file_name__, options.StringVar(&__output_file_name__, "out", __output_file_name__,
options.Alias("o"), options.Alias("o"),
@@ -90,6 +90,9 @@ func OutputModeOptionSet(options *getoptions.GetOpt) {
} }
func OutputOptionSet(options *getoptions.GetOpt) { func OutputOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__skip_empty__, "skip-empty", __skip_empty__,
options.Description("Sequences of length equal to zero are suppressed from the output"))
options.BoolVar(&__output_in_fasta__, "fasta-output", false, options.BoolVar(&__output_in_fasta__, "fasta-output", false,
options.Description("Write sequence in fasta format (default if no quality data available).")) options.Description("Write sequence in fasta format (default if no quality data available)."))
@@ -105,7 +108,7 @@ func OutputOptionSet(options *getoptions.GetOpt) {
options.Alias("O"), options.Alias("O"),
options.Description("output FASTA/FASTQ title line annotations follow OBI format.")) options.Description("output FASTA/FASTQ title line annotations follow OBI format."))
OutputModeOptionSet(options) OutputModeOptionSet(options, true)
} }
func PairedFilesOptionSet(options *getoptions.GetOpt) { func PairedFilesOptionSet(options *getoptions.GetOpt) {
@@ -159,10 +162,6 @@ func CLIOutputFormat() string {
} }
} }
func CLICompressed() bool {
return __compressed__
}
func CLISkipEmpty() bool { func CLISkipEmpty() bool {
return __skip_empty__ return __skip_empty__
} }

View File

@@ -55,6 +55,8 @@ func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
strings.HasSuffix(path, "fasta.gz") || strings.HasSuffix(path, "fasta.gz") ||
strings.HasSuffix(path, "fastq") || strings.HasSuffix(path, "fastq") ||
strings.HasSuffix(path, "fastq.gz") || strings.HasSuffix(path, "fastq.gz") ||
strings.HasSuffix(path, "fq") ||
strings.HasSuffix(path, "fq.gz") ||
strings.HasSuffix(path, "seq") || strings.HasSuffix(path, "seq") ||
strings.HasSuffix(path, "seq.gz") || strings.HasSuffix(path, "seq.gz") ||
strings.HasSuffix(path, "gb") || strings.HasSuffix(path, "gb") ||
@@ -140,7 +142,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
} }
switch CLIInputFormat() { switch CLIInputFormat() {
case "fastq": case "fastq", "fq":
reader = obiformats.ReadFastqFromFile reader = obiformats.ReadFastqFromFile
case "fasta": case "fasta":
reader = obiformats.ReadFastaFromFile reader = obiformats.ReadFastaFromFile
@@ -168,22 +170,25 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
opts..., opts...,
) )
} else { } else {
iterator, err = reader(list_of_files[0], opts...) if len(list_of_files) > 0 {
iterator, err = reader(list_of_files[0], opts...)
if err != nil {
return obiiter.NilIBioSequence, err
}
if CLIPairedFileName() != "" {
ip, err := reader(CLIPairedFileName(), opts...)
if err != nil { if err != nil {
return obiiter.NilIBioSequence, err return obiiter.NilIBioSequence, err
} }
iterator = iterator.PairTo(ip) if CLIPairedFileName() != "" {
} ip, err := reader(CLIPairedFileName(), opts...)
if err != nil {
return obiiter.NilIBioSequence, err
}
iterator = iterator.PairTo(ip)
}
} else {
iterator = obiiter.NilIBioSequence
}
} }
} }

View File

@@ -21,7 +21,7 @@ func BuildPairedFileNames(filename string) (string, string) {
forward := parts[0] + "_R1" forward := parts[0] + "_R1"
reverse := parts[0] + "_R2" reverse := parts[0] + "_R2"
if parts[1] != "" { if len(parts) > 1 && parts[1] != "" {
suffix := "." + parts[1] suffix := "." + parts[1]
forward += suffix forward += suffix
reverse += suffix reverse += suffix
@@ -58,7 +58,7 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers)) opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
opts = append(opts, obiformats.OptionsBatchSize(obidefault.BatchSize())) opts = append(opts, obiformats.OptionsBatchSize(obidefault.BatchSize()))
opts = append(opts, obiformats.OptionsCompressed(CLICompressed())) opts = append(opts, obiformats.OptionsCompressed(obidefault.CompressOutput()))
var err error var err error

View File

@@ -33,6 +33,7 @@ func CLIWriteSequenceCSV(iterator obiiter.IBioSequence,
CSVDefinition(CLIPrintDefinition()), CSVDefinition(CLIPrintDefinition()),
CSVKeys(CLIToBeKeptAttributes()), CSVKeys(CLIToBeKeptAttributes()),
CSVSequence(CLIPrintSequence()), CSVSequence(CLIPrintSequence()),
CSVQuality(CLIPrintQuality()),
CSVAutoColumn(CLIAutoColumns()), CSVAutoColumn(CLIAutoColumns()),
) )

View File

@@ -66,7 +66,7 @@ func CSVOptionSet(options *getoptions.GetOpt) {
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
obiconvert.InputOptionSet(options) obiconvert.InputOptionSet(options)
obiconvert.OutputModeOptionSet(options) obiconvert.OutputModeOptionSet(options, true)
obioptions.LoadTaxonomyOptionSet(options, false, false) obioptions.LoadTaxonomyOptionSet(options, false, false)
CSVOptionSet(options) CSVOptionSet(options)
} }

View File

@@ -40,7 +40,7 @@ func CSVSequenceHeader(opt Options) obiitercsv.CSVHeader {
} }
if opt.CSVQuality() { if opt.CSVQuality() {
record.AppendField("quality") record.AppendField("qualities")
} }
return record return record
@@ -100,9 +100,9 @@ func CSVBatchFromSequences(batch obiiter.BioSequenceBatch, opt Options) obiiterc
for j := 0; j < l; j++ { for j := 0; j < l; j++ {
ascii[j] = uint8(q[j]) + uint8(quality_shift) ascii[j] = uint8(q[j]) + uint8(quality_shift)
} }
record["quality"] = string(ascii) record["qualities"] = string(ascii)
} else { } else {
record["quality"] = opt.CSVNAValue() record["qualities"] = opt.CSVNAValue()
} }
} }

View File

@@ -33,7 +33,7 @@ func CLIDistributeSequence(sequences obiiter.IBioSequence) {
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers), opts = append(opts, obiformats.OptionsParallelWorkers(nworkers),
obiformats.OptionsBatchSize(obidefault.BatchSize()), obiformats.OptionsBatchSize(obidefault.BatchSize()),
obiformats.OptionsAppendFile(CLIAppendSequences()), obiformats.OptionsAppendFile(CLIAppendSequences()),
obiformats.OptionsCompressed(obiconvert.CLICompressed())) obiformats.OptionsCompressed(obidefault.CompressOutput()))
var formater obiformats.SequenceBatchWriterToFile var formater obiformats.SequenceBatchWriterToFile

View File

@@ -6,6 +6,7 @@ import (
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
@@ -16,6 +17,7 @@ import (
var _BelongTaxa = make([]string, 0) var _BelongTaxa = make([]string, 0)
var _NotBelongTaxa = make([]string, 0) var _NotBelongTaxa = make([]string, 0)
var _RequiredRanks = make([]string, 0) var _RequiredRanks = make([]string, 0)
var _ValidateTaxonomy = false
var _MinimumLength = 1 var _MinimumLength = 1
var _MaximumLength = int(2e9) var _MaximumLength = int(2e9)
@@ -62,6 +64,9 @@ func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
options.ArgName("RANK_NAME"), options.ArgName("RANK_NAME"),
options.Description("Select sequences belonging a taxon with a rank <RANK_NAME>")) options.Description("Select sequences belonging a taxon with a rank <RANK_NAME>"))
options.BoolVar(&_ValidateTaxonomy, "valid-taxid", _ValidateTaxonomy,
options.Description("Validate the taxonomic classification of the sequences."))
} }
func SequenceSelectionOptionSet(options *getoptions.GetOpt) { func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
@@ -248,15 +253,15 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
if len(_BelongTaxa) > 0 { if len(_BelongTaxa) > 0 {
taxonomy := obitax.DefaultTaxonomy() taxonomy := obitax.DefaultTaxonomy()
taxon := taxonomy.Taxon(_BelongTaxa[0]) taxon, _, err := taxonomy.Taxon(_BelongTaxa[0])
if taxon == nil { if err != nil {
p = obiseq.IsSubCladeOfSlot(taxonomy, _BelongTaxa[0]) p = obiseq.IsSubCladeOfSlot(taxonomy, _BelongTaxa[0])
} else { } else {
p = obiseq.IsSubCladeOf(taxonomy, taxon) p = obiseq.IsSubCladeOf(taxonomy, taxon)
} }
for _, staxid := range _BelongTaxa[1:] { for _, staxid := range _BelongTaxa[1:] {
taxon := taxonomy.Taxon(staxid) taxon, _, err := taxonomy.Taxon(staxid)
if taxon == nil { if err != nil {
p2 = obiseq.IsSubCladeOfSlot(taxonomy, staxid) p2 = obiseq.IsSubCladeOfSlot(taxonomy, staxid)
} else { } else {
p2 = obiseq.IsSubCladeOf(taxonomy, taxon) p2 = obiseq.IsSubCladeOf(taxonomy, taxon)
@@ -271,6 +276,27 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
return nil return nil
} }
func CLIIsValidTaxonomyPredicate() obiseq.SequencePredicate {
if _ValidateTaxonomy {
if !obidefault.HasSelectedTaxonomy() {
log.Fatal("Taxonomy not found")
}
taxonomy := obitax.DefaultTaxonomy()
if taxonomy == nil {
log.Fatal("Taxonomy not found")
}
predicat := func(sequences *obiseq.BioSequence) bool {
taxon := sequences.Taxon(taxonomy)
return taxon != nil
}
return predicat
}
return nil
}
func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate { func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
var p obiseq.SequencePredicate var p obiseq.SequencePredicate
var p2 obiseq.SequencePredicate var p2 obiseq.SequencePredicate
@@ -278,16 +304,16 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
if len(_NotBelongTaxa) > 0 { if len(_NotBelongTaxa) > 0 {
taxonomy := obitax.DefaultTaxonomy() taxonomy := obitax.DefaultTaxonomy()
taxon := taxonomy.Taxon(_NotBelongTaxa[0]) taxon, _, err := taxonomy.Taxon(_NotBelongTaxa[0])
if taxon == nil { if err != nil {
p = obiseq.IsSubCladeOfSlot(taxonomy, _NotBelongTaxa[0]) p = obiseq.IsSubCladeOfSlot(taxonomy, _NotBelongTaxa[0])
} else { } else {
p = obiseq.IsSubCladeOf(taxonomy, taxon) p = obiseq.IsSubCladeOf(taxonomy, taxon)
} }
for _, taxid := range _NotBelongTaxa[1:] { for _, taxid := range _NotBelongTaxa[1:] {
taxon := taxonomy.Taxon(taxid) taxon, _, err := taxonomy.Taxon(taxid)
if taxon == nil { if err != nil {
p2 = obiseq.IsSubCladeOfSlot(taxonomy, taxid) p2 = obiseq.IsSubCladeOfSlot(taxonomy, taxid)
} else { } else {
p2 = obiseq.IsSubCladeOf(taxonomy, taxon) p2 = obiseq.IsSubCladeOf(taxonomy, taxon)
@@ -319,7 +345,7 @@ func CLIHasRankDefinedPredicate() obiseq.SequencePredicate {
} }
func CLITaxonomyFilterPredicate() obiseq.SequencePredicate { func CLITaxonomyFilterPredicate() obiseq.SequencePredicate {
return CLIHasRankDefinedPredicate().And(CLIRestrictTaxonomyPredicate()).And(CLIAvoidTaxonomyPredicate()) return CLIIsValidTaxonomyPredicate().And(CLIAvoidTaxonomyPredicate()).And(CLIHasRankDefinedPredicate()).And(CLIRestrictTaxonomyPredicate())
} }
func CLIPredicatesPredicate() obiseq.SequencePredicate { func CLIPredicatesPredicate() obiseq.SequencePredicate {

View File

@@ -129,6 +129,7 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
} }
lcons := cons.Len() lcons := cons.Len()
aliLength := lcons - _Abs(left) - _Abs(right) aliLength := lcons - _Abs(left) - _Abs(right)
identity := float64(match) / float64(aliLength) identity := float64(match) / float64(aliLength)
if aliLength == 0 { if aliLength == 0 {
identity = 0 identity = 0
@@ -237,7 +238,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
log.Printf("End of the sequence Pairing") log.Printf("End of the sequence Pairing")
}() }()
f := func(iterator obiiter.IBioSequence, wid int) { f := func(iterator obiiter.IBioSequence) {
arena := obialign.MakePEAlignArena(150, 150) arena := obialign.MakePEAlignArena(150, 150)
shifts := make(map[int]int) shifts := make(map[int]int)
@@ -262,9 +263,9 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
log.Printf("Start of the sequence Pairing using %d workers\n", nworkers) log.Printf("Start of the sequence Pairing using %d workers\n", nworkers)
for i := 0; i < nworkers-1; i++ { for i := 0; i < nworkers-1; i++ {
go f(iterator.Split(), i) go f(iterator.Split())
} }
go f(iterator, nworkers-1) go f(iterator)
return newIter return newIter
} }

View File

@@ -42,9 +42,10 @@ func MatchDistanceIndex(taxonomy *obitax.Taxonomy, distance int, distanceIdx map
if i == len(keys) || distance > keys[len(keys)-1] { if i == len(keys) || distance > keys[len(keys)-1] {
taxon = taxonomy.Root() taxon = taxonomy.Root()
} else { } else {
taxon = taxonomy.Taxon(distanceIdx[keys[i]]) var err error
if taxon == nil { taxon, _, err = taxonomy.Taxon(distanceIdx[keys[i]])
log.Panicf("Cannot identify taxon %s in %s", distanceIdx[keys[i]], taxonomy.Name()) if err != nil {
log.Panicf("Cannot identify taxon %s in %s (%v)", distanceIdx[keys[i]], taxonomy.Name(), err)
} }
} }
@@ -72,6 +73,10 @@ func FindClosests(sequence *obiseq.BioSequence,
refcounts []*obikmer.Table4mer, refcounts []*obikmer.Table4mer,
runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) { runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) {
if sequence.Len() < 5 {
return obiseq.BioSequenceSlice{}, 1000, 0, "NA", []int{}
}
var matrix []uint64 var matrix []uint64
seqwords := obikmer.Count4Mer(sequence, nil, nil) seqwords := obikmer.Count4Mer(sequence, nil, nil)
@@ -196,9 +201,9 @@ func Identify(sequence *obiseq.BioSequence,
log.Panic("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d) log.Panic("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d)
} }
match_taxon := taxo.Taxon(identification) match_taxon, _, err := taxo.Taxon(identification)
if taxon != nil { if err == nil {
taxon, _ = taxon.LCA(match_taxon) taxon, _ = taxon.LCA(match_taxon)
} else { } else {
taxon = match_taxon taxon = match_taxon
@@ -255,7 +260,7 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
if taxon != nil { if taxon != nil {
j++ j++
} else { } else {
log.Warnf("Taxid %d is not described in the taxonomy %s."+ log.Warnf("Taxid %s is not described in the taxonomy %s."+
" Sequence %s is discared from the reference database", " Sequence %s is discared from the reference database",
seq.Taxid(), taxo.Name(), seq.Id()) seq.Taxid(), taxo.Name(), seq.Id())
} }

View File

@@ -6,7 +6,6 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
@@ -43,7 +42,6 @@ func TagOptionSet(options *getoptions.GetOpt) {
// the obiuniq command // the obiuniq command
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
obiconvert.OptionSet(options) obiconvert.OptionSet(options)
obioptions.LoadTaxonomyOptionSet(options, true, false)
TagOptionSet(options) TagOptionSet(options)
} }

View File

@@ -1,10 +1,15 @@
package obitaxonomy package obitaxonomy
import ( import (
"fmt"
"time"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
) )
@@ -73,3 +78,18 @@ func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obiitercsv.ICSVRecord {
func CLICSVTaxaWriter(iterator *obitax.ITaxon, terminalAction bool) *obiitercsv.ICSVRecord { func CLICSVTaxaWriter(iterator *obitax.ITaxon, terminalAction bool) *obiitercsv.ICSVRecord {
return obicsv.CLICSVWriter(CLICSVTaxaIterator(iterator), terminalAction) return obicsv.CLICSVWriter(CLICSVTaxaIterator(iterator), terminalAction)
} }
func CLIDownloadNCBITaxdump() error {
now := time.Now()
dateStr := now.Format("20060102") // In Go, this specific date is used as reference for formatting
filename := fmt.Sprintf("ncbitaxo_%s.tgz", dateStr)
if obiconvert.CLIOutPutFileName() != "-" {
filename = obiconvert.CLIOutPutFileName()
}
log.Infof("Downloading NCBI Taxdump to %s", filename)
return obiutils.DownloadFile("https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz", filename)
}

View File

@@ -5,6 +5,7 @@ import (
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax" "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
"github.com/DavidGamba/go-getoptions" "github.com/DavidGamba/go-getoptions"
) )
@@ -22,6 +23,8 @@ var __taxid_path__ = "NA"
var __taxid_sons__ = "NA" var __taxid_sons__ = "NA"
var __restrict_rank__ = "" var __restrict_rank__ = ""
var __to_dump__ = "" var __to_dump__ = ""
var __download_ncbi__ = false
var __extract_taxonomy__ = false
func FilterTaxonomyOptionSet(options *getoptions.GetOpt) { func FilterTaxonomyOptionSet(options *getoptions.GetOpt) {
options.BoolVar(&__rank_list__, "rank-list", false, options.BoolVar(&__rank_list__, "rank-list", false,
@@ -34,7 +37,8 @@ func FilterTaxonomyOptionSet(options *getoptions.GetOpt) {
} }
func OptionSet(options *getoptions.GetOpt) { func OptionSet(options *getoptions.GetOpt) {
obioptions.LoadTaxonomyOptionSet(options, true, true) obioptions.LoadTaxonomyOptionSet(options, false, true)
obiconvert.OutputModeOptionSet(options, false)
FilterTaxonomyOptionSet(options) FilterTaxonomyOptionSet(options)
options.BoolVar(&__fixed_pattern__, "fixed", false, options.BoolVar(&__fixed_pattern__, "fixed", false,
options.Alias("F"), options.Alias("F"),
@@ -70,6 +74,12 @@ func OptionSet(options *getoptions.GetOpt) {
options.ArgName("TAXID"), options.ArgName("TAXID"),
options.Description("Dump a sub-taxonomy corresponding to the precised clade"), options.Description("Dump a sub-taxonomy corresponding to the precised clade"),
) )
options.BoolVar(&__download_ncbi__, "download-ncbi", __download_ncbi__,
options.Description("Download the current NCBI taxonomy taxdump"),
)
options.BoolVar(&__extract_taxonomy__, "extract-taxonomy", __extract_taxonomy__,
options.Description("Extract taxonomy from a sequence file"),
)
} }
func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) { func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) {
@@ -81,13 +91,14 @@ func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) {
ts := taxonomy.NewTaxonSet() ts := taxonomy.NewTaxonSet()
for _, taxid := range __taxonomical_restriction__ { for _, taxid := range __taxonomical_restriction__ {
tx := taxonomy.Taxon(taxid) tx, _, err := taxonomy.Taxon(taxid)
if tx == nil { if err != nil {
return nil, fmt.Errorf( return nil, fmt.Errorf(
"cannot find taxon %s in taxonomy %s", "cannot find taxon %s in taxonomy %s (%v)",
taxid, taxid,
taxonomy.Name(), taxonomy.Name(),
err,
) )
} }
@@ -144,3 +155,11 @@ func CLIDumpSubtaxonomy() bool {
func CLISubTaxonomyNode() string { func CLISubTaxonomyNode() string {
return __to_dump__ return __to_dump__
} }
func CLIDownloadNCBI() bool {
return __download_ncbi__
}
func CLIExtractTaxonomy() bool {
return __extract_taxonomy__
}

View File

@@ -93,3 +93,145 @@ func MapToMapInterface(m interface{}) map[string]interface{} {
log.Panic("Invalid map type") log.Panic("Invalid map type")
return make(map[string]interface{}) return make(map[string]interface{})
} }
// InterfaceToInt converts a interface{} to an integer value if possible.
// If not a "NotAnInteger" error is returned via the err
// return value and val is set to 0.
func InterfaceToInt(i interface{}) (val int, err error) {
err = nil
val = 0
switch t := i.(type) {
case int:
val = t
case int8:
val = int(t) // standardizes across systems
case int16:
val = int(t) // standardizes across systems
case int32:
val = int(t) // standardizes across systems
case int64:
val = int(t) // standardizes across systems
case float32:
val = int(t) // standardizes across systems
case float64:
val = int(t) // standardizes across systems
case uint8:
val = int(t) // standardizes across systems
case uint16:
val = int(t) // standardizes across systems
case uint32:
val = int(t) // standardizes across systems
case uint64:
val = int(t) // standardizes across systems
default:
err = &NotAnInteger{"value attribute cannot be casted to an integer"}
}
return
}
// InterfaceToInt converts a interface{} to an integer value if possible.
// If not a "NotAnInteger" error is returned via the err
// return value and val is set to 0.
func InterfaceToFloat64(i interface{}) (val float64, err error) {
err = nil
val = 0
switch t := i.(type) {
case int:
val = float64(t)
case int8:
val = float64(t) // standardizes across systems
case int16:
val = float64(t) // standardizes across systems
case int32:
val = float64(t) // standardizes across systems
case int64:
val = float64(t) // standardizes across systems
case float32:
val = float64(t) // standardizes across systems
case float64:
val = t // standardizes across systems
case uint8:
val = float64(t) // standardizes across systems
case uint16:
val = float64(t) // standardizes across systems
case uint32:
val = float64(t) // standardizes across systems
case uint64:
val = float64(t) // standardizes across systems
default:
err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
}
return
}
func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
err = nil
switch i := i.(type) {
case map[string]int:
val = i
case map[string]interface{}:
val = make(map[string]int, len(i))
for k, v := range i {
val[k], err = InterfaceToInt(v)
if err != nil {
return
}
}
case map[string]float64:
val = make(map[string]int, len(i))
for k, v := range i {
val[k] = int(v)
}
default:
err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
}
return
}
func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
err = nil
switch i := i.(type) {
case map[string]string:
val = i
case map[string]interface{}:
val = make(map[string]string, len(i))
for k, v := range i {
val[k], err = InterfaceToString(v)
if err != nil {
return
}
}
default:
err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
}
return
}
func InterfaceToStringSlice(i interface{}) (val []string, err error) {
err = nil
switch i := i.(type) {
case []string:
val = i
case []interface{}:
val = make([]string, len(i))
for k, v := range i {
val[k], err = InterfaceToString(v)
if err != nil {
return
}
}
default:
err = &NotAMapInt{"value attribute cannot be casted to a []string"}
}
return
}

45
pkg/obiutils/download.go Normal file
View File

@@ -0,0 +1,45 @@
package obiutils
import (
"fmt"
"io"
"net/http"
"os"
"github.com/schollz/progressbar/v3"
)
func DownloadFile(url string, filepath string) error {
// Get the data
resp, err := http.Get(url)
if err != nil {
return err
}
defer resp.Body.Close()
// Check server response
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("bad status: %s", resp.Status)
}
// Create the file
out, err := os.Create(filepath)
if err != nil {
return err
}
defer out.Close()
// Create progress bar
bar := progressbar.DefaultBytes(
resp.ContentLength,
"downloading",
)
// Write the body to file while updating the progress bar
_, err = io.Copy(io.MultiWriter(out, bar), resp.Body)
if err != nil {
return err
}
return nil
}

View File

@@ -25,43 +25,6 @@ func (m *NotAnInteger) Error() string {
return m.message return m.message
} }
// InterfaceToInt converts a interface{} to an integer value if possible.
// If not a "NotAnInteger" error is returned via the err
// return value and val is set to 0.
func InterfaceToInt(i interface{}) (val int, err error) {
err = nil
val = 0
switch t := i.(type) {
case int:
val = t
case int8:
val = int(t) // standardizes across systems
case int16:
val = int(t) // standardizes across systems
case int32:
val = int(t) // standardizes across systems
case int64:
val = int(t) // standardizes across systems
case float32:
val = int(t) // standardizes across systems
case float64:
val = int(t) // standardizes across systems
case uint8:
val = int(t) // standardizes across systems
case uint16:
val = int(t) // standardizes across systems
case uint32:
val = int(t) // standardizes across systems
case uint64:
val = int(t) // standardizes across systems
default:
err = &NotAnInteger{"value attribute cannot be casted to an integer"}
}
return
}
// NotAnInteger defines a new type of Error : "NotAnInteger" // NotAnInteger defines a new type of Error : "NotAnInteger"
type NotAnFloat64 struct { type NotAnFloat64 struct {
message string message string
@@ -74,43 +37,6 @@ func (m *NotAnFloat64) Error() string {
return m.message return m.message
} }
// InterfaceToInt converts a interface{} to an integer value if possible.
// If not a "NotAnInteger" error is returned via the err
// return value and val is set to 0.
func InterfaceToFloat64(i interface{}) (val float64, err error) {
err = nil
val = 0
switch t := i.(type) {
case int:
val = float64(t)
case int8:
val = float64(t) // standardizes across systems
case int16:
val = float64(t) // standardizes across systems
case int32:
val = float64(t) // standardizes across systems
case int64:
val = float64(t) // standardizes across systems
case float32:
val = float64(t) // standardizes across systems
case float64:
val = t // standardizes across systems
case uint8:
val = float64(t) // standardizes across systems
case uint16:
val = float64(t) // standardizes across systems
case uint32:
val = float64(t) // standardizes across systems
case uint64:
val = float64(t) // standardizes across systems
default:
err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
}
return
}
// NotABoolean defines a new type of Error : "NotAMapInt" // NotABoolean defines a new type of Error : "NotAMapInt"
type NotAMapInt struct { type NotAMapInt struct {
message string message string
@@ -123,53 +49,6 @@ func (m *NotAMapInt) Error() string {
return m.message return m.message
} }
func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
err = nil
switch i := i.(type) {
case map[string]int:
val = i
case map[string]interface{}:
val = make(map[string]int, len(i))
for k, v := range i {
val[k], err = InterfaceToInt(v)
if err != nil {
return
}
}
case map[string]float64:
val = make(map[string]int, len(i))
for k, v := range i {
val[k] = int(v)
}
default:
err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
}
return
}
func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
err = nil
switch i := i.(type) {
case map[string]string:
val = i
case map[string]interface{}:
val = make(map[string]string, len(i))
for k, v := range i {
val[k], err = InterfaceToString(v)
if err != nil {
return
}
}
default:
err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
}
return
}
// NotABoolean defines a new type of Error : "NotAMapInt" // NotABoolean defines a new type of Error : "NotAMapInt"
type NotAMapFloat64 struct { type NotAMapFloat64 struct {
message string message string

View File

@@ -23,7 +23,7 @@ func MakeSet[E comparable](vals ...E) Set[E] {
// It takes a variadic parameter of type E, where E is a comparable type. // It takes a variadic parameter of type E, where E is a comparable type.
// It returns a pointer to a Set of type E. // It returns a pointer to a Set of type E.
func NewSet[E comparable](vals ...E) *Set[E] { func NewSet[E comparable](vals ...E) *Set[E] {
s := MakeSet[E](vals...) s := MakeSet(vals...)
return &s return &s
} }

View File

@@ -50,7 +50,7 @@ func TestNewSet(t *testing.T) {
} }
// Test Case 2: Creating a set with multiple values // Test Case 2: Creating a set with multiple values
set2 := NewSet[string]("apple", "banana", "cherry") set2 := NewSet("apple", "banana", "cherry")
if len(*set2) != 3 { if len(*set2) != 3 {
t.Errorf("Expected size to be 3, but got %d", len(*set2)) t.Errorf("Expected size to be 3, but got %d", len(*set2))
} }
@@ -147,7 +147,7 @@ func TestMembers(t *testing.T) {
} }
// Test case 2: Set with multiple elements // Test case 2: Set with multiple elements
set = MakeSet[int](1, 2, 3) set = MakeSet(1, 2, 3)
expected = []int{1, 2, 3} expected = []int{1, 2, 3}
actual = set.Members() actual = set.Members()
sort.Ints(actual) sort.Ints(actual)
@@ -172,7 +172,7 @@ func TestSetString(t *testing.T) {
} }
// Test set with single member // Test set with single member
singleMemberSet := NewSet[int](42) singleMemberSet := NewSet(42)
singleMemberSetString := singleMemberSet.String() singleMemberSetString := singleMemberSet.String()
expectedSingleMemberSetString := "[42]" expectedSingleMemberSetString := "[42]"
if singleMemberSetString != expectedSingleMemberSetString { if singleMemberSetString != expectedSingleMemberSetString {
@@ -180,7 +180,7 @@ func TestSetString(t *testing.T) {
} }
// Test set with multiple members // Test set with multiple members
multipleMembersSet := NewSet[int](1, 2, 3) multipleMembersSet := NewSet(1, 2, 3)
multipleMembersSetString := multipleMembersSet.String() multipleMembersSetString := multipleMembersSet.String()
expectedMultipleMembersSetString := "[1 2 3]" expectedMultipleMembersSetString := "[1 2 3]"
if multipleMembersSetString != expectedMultipleMembersSetString { if multipleMembersSetString != expectedMultipleMembersSetString {
@@ -213,26 +213,26 @@ func TestUnion(t *testing.T) {
// Test case 2: Union of an empty set and a non-empty set should return the non-empty set // Test case 2: Union of an empty set and a non-empty set should return the non-empty set
set1 = MakeSet[int]() set1 = MakeSet[int]()
set2 = MakeSet[int](1, 2, 3) set2 = MakeSet(1, 2, 3)
expected = MakeSet[int](1, 2, 3) expected = MakeSet(1, 2, 3)
result = set1.Union(set2) result = set1.Union(set2)
if !reflect.DeepEqual(result, expected) { if !reflect.DeepEqual(result, expected) {
t.Errorf("Expected %v, but got %v", expected, result) t.Errorf("Expected %v, but got %v", expected, result)
} }
// Test case 3: Union of two non-empty sets with common elements should return a set with unique elements // Test case 3: Union of two non-empty sets with common elements should return a set with unique elements
set1 = MakeSet[int](1, 2, 3) set1 = MakeSet(1, 2, 3)
set2 = MakeSet[int](2, 3, 4) set2 = MakeSet(2, 3, 4)
expected = MakeSet[int](1, 2, 3, 4) expected = MakeSet(1, 2, 3, 4)
result = set1.Union(set2) result = set1.Union(set2)
if !reflect.DeepEqual(result, expected) { if !reflect.DeepEqual(result, expected) {
t.Errorf("Expected %v, but got %v", expected, result) t.Errorf("Expected %v, but got %v", expected, result)
} }
// Test case 4: Union of two non-empty sets with no common elements should return a set with all elements // Test case 4: Union of two non-empty sets with no common elements should return a set with all elements
set1 = MakeSet[int](1, 2, 3) set1 = MakeSet(1, 2, 3)
set2 = MakeSet[int](4, 5, 6) set2 = MakeSet(4, 5, 6)
expected = MakeSet[int](1, 2, 3, 4, 5, 6) expected = MakeSet(1, 2, 3, 4, 5, 6)
result = set1.Union(set2) result = set1.Union(set2)
if !reflect.DeepEqual(result, expected) { if !reflect.DeepEqual(result, expected) {
t.Errorf("Expected %v, but got %v", expected, result) t.Errorf("Expected %v, but got %v", expected, result)

4462
xxx.csv

File diff suppressed because it is too large Load Diff