mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-26 22:00:52 +00:00
Compare commits
46 Commits
taxonomy
...
Release_4.
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
996ec69bd9 | ||
|
|
5f9182d25b | ||
|
|
9913fa8354 | ||
|
|
7b23314651 | ||
|
|
1e541eac4c | ||
|
|
13cd4c86ac | ||
|
|
75dd535201 | ||
|
|
573acafafc | ||
|
|
0067152c2b | ||
|
|
791d253edc | ||
|
|
6245d7f684 | ||
|
|
13d610aff7 | ||
|
|
db284f1d44 | ||
|
|
51b3e83d32 | ||
|
|
8671285d02 | ||
|
|
51d11aa36d | ||
|
|
fb6f857d8c | ||
|
|
d4209b4549 | ||
|
|
ef05d4975f | ||
|
|
4588bf8b5d | ||
|
|
090633850d | ||
|
|
15a058cf63 | ||
|
|
2f5f7634d6 | ||
|
|
48138b605c | ||
|
|
aed22c12a6 | ||
|
|
443a9b3ce3 | ||
|
|
7e90537379 | ||
|
|
d3d15acc6c | ||
|
|
bd4a0b5ca5 | ||
|
|
952f85f312 | ||
|
|
4774438644 | ||
|
|
6a8061cc4f | ||
|
|
e2563cd8df | ||
|
|
f2e81adf95 | ||
|
|
f27e9bc91e | ||
|
|
773e54965d | ||
|
|
ceca33998b | ||
|
|
b9bee5f426 | ||
|
|
c10df073a7 | ||
|
|
d3dac1b21f | ||
|
|
0df082da06 | ||
|
|
2452aef7a9 | ||
|
|
337954592d | ||
|
|
8a28c9ae7c | ||
|
|
b6b18c0fa1 | ||
|
|
67e2758d63 |
19
.github/workflows/obitest.yml
vendored
Normal file
19
.github/workflows/obitest.yml
vendored
Normal file
@@ -0,0 +1,19 @@
|
||||
name: "Run the obitools command test suite"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- V*
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v2
|
||||
with:
|
||||
go-version: '1.23'
|
||||
- name: Checkout obitools4 project
|
||||
uses: actions/checkout@v4
|
||||
- name: Run tests
|
||||
run: make githubtests
|
||||
153
.gitignore
vendored
153
.gitignore
vendored
@@ -1,134 +1,27 @@
|
||||
cpu.pprof
|
||||
cpu.trace
|
||||
test
|
||||
bin
|
||||
vendor
|
||||
*.fastq
|
||||
*.fasta
|
||||
*.fastq.gz
|
||||
*.fasta.gz
|
||||
.DS_Store
|
||||
*.gml
|
||||
*.log
|
||||
/argaly
|
||||
**/cpu.pprof
|
||||
**/cpu.trace
|
||||
**/test
|
||||
**/bin
|
||||
**/vendor
|
||||
**/*.fastq
|
||||
**/*.fasta
|
||||
**/*.fastq.gz
|
||||
**/*.fasta.gz
|
||||
**/.DS_Store
|
||||
**/*.gml
|
||||
**/*.log
|
||||
**/xxx*
|
||||
**/*.sav
|
||||
**/*.old
|
||||
**/*.tgz
|
||||
**/*.yaml
|
||||
**/*.csv
|
||||
|
||||
/obiconvert
|
||||
/obicount
|
||||
/obimultiplex
|
||||
/obipairing
|
||||
/obipcr
|
||||
/obifind
|
||||
/obidistribute
|
||||
/obiuniq
|
||||
.rhistory
|
||||
/.vscode
|
||||
/build
|
||||
/Makefile.old
|
||||
.Rproj.user
|
||||
obitools.Rproj
|
||||
Stat_error.knit.md
|
||||
.Rhistory
|
||||
Stat_error.nb.html
|
||||
Stat_error.Rmd
|
||||
|
||||
/.luarc.json
|
||||
/doc/TAXO/
|
||||
/doc/results/
|
||||
/doc/_main.log
|
||||
/doc/_book/_main.tex
|
||||
/doc/_freeze/
|
||||
/doc/tutorial_files/
|
||||
/doc/wolf_data/
|
||||
/taxdump/
|
||||
/.vscode/
|
||||
/ncbitaxo
|
||||
|
||||
/Algo-Alignement.numbers
|
||||
/Estimate_proba_true_seq.html
|
||||
/Estimate_proba_true_seq.nb.html
|
||||
/Estimate_proba_true_seq.Rmd
|
||||
/modele_error_euka.qmd
|
||||
/obitools.code-workspace
|
||||
.DS_Store
|
||||
.RData
|
||||
x
|
||||
xxx
|
||||
y
|
||||
/doc/wolf_diet.tgz
|
||||
/doc/man/depends
|
||||
/sample/wolf_R1.fasta.gz
|
||||
/sample/wolf_R2.fasta.gz
|
||||
/sample/euka03.ecotag.fasta.gz
|
||||
/sample/ratio.csv
|
||||
/sample/STD_PLN_1.dat
|
||||
/sample/STD_PLN_2.dat
|
||||
/sample/subset_Pasvik_R1.fastq.gz
|
||||
/sample/subset_Pasvik_R2.fastq.gz
|
||||
/sample/test_gobitools.fasta.bz2
|
||||
euka03.csv*
|
||||
gbbct793.seq.gz
|
||||
gbinv1003.seq.gz
|
||||
gbpln210.seq
|
||||
/doc/book/OBITools-V4.aux
|
||||
/doc/book/OBITools-V4.fdb_latexmk
|
||||
/doc/book/OBITools-V4.fls
|
||||
/doc/book/OBITools-V4.log
|
||||
/doc/book/OBITools-V4.pdf
|
||||
/doc/book/OBITools-V4.synctex.gz
|
||||
/doc/book/OBITools-V4.tex
|
||||
/doc/book/OBITools-V4.toc
|
||||
getoptions.adoc
|
||||
Archive.zip
|
||||
.DS_Store
|
||||
sample/.DS_Store
|
||||
sample/consensus_graphs/specimen_hac_plants_Vern_disicolor_.gml
|
||||
93954
|
||||
Bact03.e5.gb_R254.obipcr.idx.fasta.save
|
||||
sample/test.obipcr.log
|
||||
Bact02.e3.gb_R254.obipcr.fasta.gz
|
||||
Example_Arth03.ngsfilter
|
||||
SPER01.csv
|
||||
SPER03.csv
|
||||
wolf_diet_ngsfilter.txt
|
||||
xx
|
||||
xxx.gb
|
||||
yyy_geom.csv
|
||||
yyy_LCS.csv
|
||||
yyy.json
|
||||
bug_obimultiplex/toto
|
||||
bug_obimultiplex/toto_mapping
|
||||
bug_obimultiplex/tutu
|
||||
bug_obimultiplex/tutu_mapping
|
||||
bug_obipairing/GIT1_GH_ngsfilter.txt
|
||||
doc/book/TAXO/citations.dmp
|
||||
doc/book/TAXO/delnodes.dmp
|
||||
doc/book/TAXO/division.dmp
|
||||
doc/book/TAXO/gc.prt
|
||||
doc/book/TAXO/gencode.dmp
|
||||
doc/book/TAXO/merged.dmp
|
||||
doc/book/TAXO/names.dmp
|
||||
doc/book/TAXO/nodes.dmp
|
||||
doc/book/TAXO/readme.txt
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/citations.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/delnodes.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/division.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/gc.prt
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/gencode.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/merged.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/names.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/nodes.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/readme.txt
|
||||
doc/book/results/toto.tasta
|
||||
sample/.DS_Store
|
||||
GO
|
||||
ncbitaxo/citations.dmp
|
||||
ncbitaxo/delnodes.dmp
|
||||
ncbitaxo/division.dmp
|
||||
ncbitaxo/gc.prt
|
||||
ncbitaxo/gencode.dmp
|
||||
ncbitaxo/merged.dmp
|
||||
ncbitaxo/names.dmp
|
||||
ncbitaxo/nodes.dmp
|
||||
ncbitaxo/readme.txt
|
||||
template.16S
|
||||
xxx.gz
|
||||
*.sav
|
||||
*.old
|
||||
ncbitaxo.tgz
|
||||
!/obitests/**
|
||||
!/sample/**
|
||||
|
||||
9
Makefile
9
Makefile
@@ -63,6 +63,13 @@ update-deps:
|
||||
|
||||
test:
|
||||
$(GOTEST) ./...
|
||||
|
||||
obitests:
|
||||
@for t in $$(find obitests -name test.sh -print) ; do \
|
||||
bash $${t} ;\
|
||||
done
|
||||
|
||||
githubtests: obitools obitests
|
||||
|
||||
man:
|
||||
make -C doc man
|
||||
@@ -97,5 +104,5 @@ ifneq ($(strip $(COMMIT_ID)),)
|
||||
@rm -f $(OUTPUT)
|
||||
endif
|
||||
|
||||
.PHONY: all packages obitools man obibook doc update-deps .FORCE
|
||||
.PHONY: all packages obitools man obibook doc update-deps obitests githubtests .FORCE
|
||||
.FORCE:
|
||||
@@ -37,7 +37,7 @@ curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install
|
||||
bash -s -- --install-dir test_install --obitools-prefix k
|
||||
```
|
||||
|
||||
In this case, the binaries will be installed in the `test_install` directory and all command names will be prefixed with the letter `k`. Thus `obigrep` will be named `kobigrep`.
|
||||
In this case, the binaries will be installed in the `test_install` directory and all command names will be prefixed with the letter `k`. Thus, `obigrep` will be named `kobigrep`.
|
||||
|
||||
## Continuing the analysis...
|
||||
|
||||
|
||||
251
Release-notes.md
251
Release-notes.md
@@ -1,19 +1,34 @@
|
||||
# OBITools release notes
|
||||
|
||||
## Latest changes
|
||||
## March 1st, 2025. Release 4.4.0
|
||||
|
||||
A new documentation website is available at https://obitools4.metabarcoding.org.
|
||||
Its development is still in progress.
|
||||
|
||||
The biggest step forward in this new version is taxonomy management.
|
||||
The new version is now able to handle taxonomic identifiers that are not just integer values. This is a first step towards an easy way to handle other taxonomy databases soon, such as the GBIF or Catalogue of Life taxonomies.
|
||||
This version is able to handle files containing taxonomic information created by previous versions of OBITools, but previous versions may have trouble analysing files created by this new version, at least for the taxonomic information.
|
||||
|
||||
|
||||
### Breaking changes
|
||||
|
||||
- In `obimultiplex`, the short version of the **--tag-list** option used to specify the list
|
||||
of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`.
|
||||
- In `obimultiplex`, the short version of the **--tag-list** option used to
|
||||
specify the list of tags and primers to be used for the demultiplexing has
|
||||
been changed from `-t` to `-s`.
|
||||
|
||||
- The command `obifind` is now renamed `obitaxonomy`.
|
||||
|
||||
- The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy
|
||||
has been renamed to **--taxonomy**.
|
||||
- The **--taxdump** option used to specify the path to the taxdump containing
|
||||
the NCBI taxonomy has been renamed to **--taxonomy**.
|
||||
|
||||
### Bug fixes
|
||||
|
||||
- Correction of a bug when using paired sequence file with the **--out** option.
|
||||
|
||||
- Correction of a bug in `obitag` when trying to annotate very short sequences of
|
||||
4 bases or less.
|
||||
|
||||
|
||||
- In `obipairing`, correct the stats `seq_a_single` and `seq_b_single` when
|
||||
on right alignment mode
|
||||
|
||||
@@ -21,12 +36,32 @@
|
||||
the batch size and not reading the qualities from the fastq files as `obiuniq`
|
||||
is producing only fasta output without qualities.
|
||||
|
||||
- In `obitag`, correct the wrong assignment of the **obitag_bestmatch**
|
||||
attribute.
|
||||
|
||||
- In `obiclean`, the **--no-progress-bar** option disables all progress bars,
|
||||
not just the data.
|
||||
|
||||
- Several fixes in reading FASTA and FASTQ files, including some code
|
||||
simplification and factorization.
|
||||
|
||||
- Fixed a bug in all obitools that caused the same file to be processed
|
||||
multiple times, when specifying a directory name as input.
|
||||
|
||||
|
||||
### New features
|
||||
|
||||
- `obigrep` add a new **--valid-taxid** option to keep only sequence with a
|
||||
valid taxid
|
||||
|
||||
- `obiclean` add a new **--min-sample-count** option with a default value of 1,
|
||||
asking to filter out sequences which are not occurring in at least the
|
||||
specified number of samples.
|
||||
|
||||
- `obitaxonomy` a new **--dump|D** option allows for dumping a sub-taxonomy.
|
||||
|
||||
- Taxonomy dump can now be provided as a four-columns CSV file to the **--taxonomy**
|
||||
option.
|
||||
- Taxonomy dump can now be provided as a four-columns CSV file to the
|
||||
**--taxonomy** option.
|
||||
|
||||
- NCBI Taxonomy dump does not need to be uncompressed and unarchived anymore. The
|
||||
path of the tar and gzipped dump file can be directly specified using the
|
||||
@@ -37,54 +72,68 @@
|
||||
allow the processing of the rare fasta and fastq files not recognized.
|
||||
|
||||
- In `obiscript`, adds new methods to the Lua sequence object:
|
||||
- `md5_string()`: returning the MD5 check sum as an hexadecimal string,
|
||||
- `subsequence(from,to)`: allows to extract a subsequence on a 0 based
|
||||
coordinate system, upper bound expluded like in go.
|
||||
- `reverse_complement`: returning a sequence object corresponding to the reverse complement
|
||||
of the current sequence.
|
||||
- `md5_string()`: returning the MD5 check sum as a hexadecimal string,
|
||||
- `subsequence(from,to)`: allows extracting a subsequence on a 0 based
|
||||
coordinate system, upper bound excluded like in go.
|
||||
- `reverse_complement`: returning a sequence object corresponding to the
|
||||
reverse complement of the current sequence.
|
||||
|
||||
### Change of git repositiory
|
||||
### Enhancement
|
||||
|
||||
- The OBITools4 git repository has been moved to the github repository.
|
||||
- All obitools now have a **--taxonomy** option. If specified, the taxonomy is
|
||||
loaded first and taxids annotating the sequences are validated against that
|
||||
taxonomy. A warning is issued for any invalid taxid and for any taxid that
|
||||
is transferred to a new taxid. The **--update-taxid** option allows these
|
||||
old taxids to be replaced with their new equivalent in the result of the
|
||||
obitools command.
|
||||
|
||||
- The scoring system used by the `obipairing` command has been changed to be
|
||||
more coherent. In the new version, the scores associated to a match and a
|
||||
mismatch involving a nucleotide with a quality score of 0 are equal. Which
|
||||
is normal as a zero quality score means a perfect indecision on the read
|
||||
nucleotide, therefore there is no reason to penalize a match differently
|
||||
from a mismatch (see
|
||||
https://obitools4.metabarcoding.org/docs/commands/alignments/obipairing/exact-alignment/).
|
||||
|
||||
- In every *OBITools* command, the progress bar is automatically deactivated
|
||||
when the standard error output is redirected.
|
||||
|
||||
- As Genbank and ENA:EMBL contain very large sequences, while
|
||||
OBITools4 is optimized for short sequences, `obipcr` faces some problems
|
||||
with excessive consumption of computer resources, especially memory. Several
|
||||
improvements in the tuning of the default `obipcr` parameters and some new
|
||||
features, currently only available for FASTA and FASTQ file readers, have
|
||||
been implemented to limit the memory impact of `obipcr` without changing the
|
||||
computational efficiency too much.
|
||||
|
||||
- Logging system and therefore format, have been homogenized.
|
||||
|
||||
## August 2nd, 2024. Release 4.3.0
|
||||
|
||||
### Change of git repository
|
||||
|
||||
- The OBITools4 git repository has been moved to the GitHub repository.
|
||||
The new address is: https://github.com/metabarcoding/obitools4.
|
||||
Take care for using the new install script for retrieving the new version.
|
||||
|
||||
```bash
|
||||
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh \
|
||||
curl -L https://metabarcoding.org/obitools4/install.sh \
|
||||
| bash
|
||||
```
|
||||
|
||||
or with options:
|
||||
|
||||
```bash
|
||||
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh \
|
||||
curl -L https://metabarcoding.org/obitools4/install.sh \
|
||||
| bash -s -- --install-dir test_install --obitools-prefix k
|
||||
```
|
||||
|
||||
### CPU limitation
|
||||
|
||||
- By default, *OBITools4* tries to use all the computing power available on
|
||||
your computer. In some circumstances this can be problematic (e.g. if you
|
||||
are running on a computer cluster managed by your university). You can limit
|
||||
the number of CPU cores used by *OBITools4* either by using the **--max-cpu**
|
||||
option or by setting the **OBIMAXCPU** environment variable. Some strange
|
||||
behaviour of *OBITools4* has been observed when users try to limit the
|
||||
maximum number of usable CPU cores to one. This seems to be caused by the Go
|
||||
language, and it is not obvious to get *OBITools4* to run correctly on a
|
||||
single core in all circumstances. Therefore, if you ask to use a single
|
||||
core, **OBITools4** will print a warning message and actually set this
|
||||
parameter to two cores. If you really want a single core, you can use the
|
||||
**--force-one-core** option. But be aware that this can lead to incorrect
|
||||
calculations.
|
||||
|
||||
### New features
|
||||
|
||||
- The output of the obitools will evolve to produce results only in standard
|
||||
formats such as fasta and fastq. For non-sequential data, the output will be
|
||||
in CSV format, with the separator `,`, the decimal separator `.`, and a
|
||||
header line with the column names. It is more convenient to use the output
|
||||
in other programs. For example, you can use the `csvtomd` command to
|
||||
reformat the csv output into a markdown table. The first command to initiate
|
||||
reformat the CSV output into a Markdown table. The first command to initiate
|
||||
this change is `obicount`, which now produces a 3-line CSV output.
|
||||
|
||||
```bash
|
||||
@@ -96,7 +145,7 @@
|
||||
database for `obitag` is to use `obipcr` on a local copy of Genbank or EMBL.
|
||||
However, these sequence databases are known to contain many taxonomic
|
||||
errors, such as bacterial sequences annotated with the taxid of their host
|
||||
species. obicleandb tries to detect these errors. To do this, it first keeps
|
||||
species. `obicleandb` tries to detect these errors. To do this, it first keeps
|
||||
only sequences annotated with the taxid to which a species, genus, and
|
||||
family taxid can be assigned. Then, for each sequence, it compares the
|
||||
distance of the sequence to the other sequences belonging to the same genus
|
||||
@@ -107,7 +156,7 @@
|
||||
with the p-value of the Mann-Whitney U test in the **obicleandb_trusted**
|
||||
slot. Later, the distribution of this p-value can be analyzed to determine a
|
||||
threshold. Empirically, a threshold of 0.05 is a good compromise and allows
|
||||
to filter out less than 1‰ of the sequences. These sequences can then be
|
||||
filtering out less than 1‰ of the sequences. These sequences can then be
|
||||
removed using `obigrep`.
|
||||
|
||||
- Adds a new `obijoin` utility to join information contained in a sequence
|
||||
@@ -117,16 +166,16 @@
|
||||
|
||||
- Adds a new tool `obidemerge` to demerge a `merge_xxx` slot by recreating the
|
||||
multiple identical sequences having the slot `xxx` recreated with its initial
|
||||
value and the sequence count set to the number of occurences refered in the
|
||||
value and the sequence count set to the number of occurrences referred in the
|
||||
`merge_xxx` slot. During the operation, the `merge_xxx` slot is removed.
|
||||
|
||||
- Adds CSV as one of the input format for every obitools command. To encode
|
||||
sequence the CSV file must includes a column named `sequence` and another
|
||||
sequence the CSV file must include a column named `sequence` and another
|
||||
column named `id`. An extra column named `qualities` can be added to specify
|
||||
the quality scores of the sequence following the same ascii encoding than the
|
||||
the quality scores of the sequence following the same ASCII encoding as the
|
||||
fastq format. All the other columns will be considered as annotations and will
|
||||
be interpreted as JSON objects encoding potentially for atomic values. If a
|
||||
calumn value can not be decoded as JSON it will be considered as a string.
|
||||
column value can not be decoded as JSON it will be considered as a string.
|
||||
|
||||
- A new option **--version** has been added to every obitools command. It will
|
||||
print the version of the command.
|
||||
@@ -135,8 +184,8 @@
|
||||
quality scores from a BioSequence object.\
|
||||
|
||||
- In `obimultiplex` the ngsfilter file describing the samples can now be provided
|
||||
not only using the classical nfsfilter format but also using the csv format.
|
||||
When using csv, the first line must contain the column names. 5 columns are
|
||||
not only using the classical ngsfilter format but also using the CSV format.
|
||||
When using CSV, the first line must contain the column names. 5 columns are
|
||||
expected:
|
||||
|
||||
- `experiment` the name of the experiment
|
||||
@@ -152,43 +201,34 @@
|
||||
|
||||
Supplementary columns are allowed. Their names and content will be used to
|
||||
annotate the sequence corresponding to the sample, as the `key=value;` did
|
||||
in the nfsfilter format.
|
||||
in the ngsfilter format.
|
||||
|
||||
The CSV format used allows for comment lines starting with `#` character.
|
||||
Special data lines starting with `@param` in the first column allow to
|
||||
configure the algorithm. The options **--template** provided an over
|
||||
commented example of the csv format, including all the possible options.
|
||||
Special data lines starting with `@param` in the first column allow configuring the algorithm. The **--template** option provides a heavily
|
||||
commented example of the CSV format, including all the possible options.
|
||||
|
||||
### CPU limitation
|
||||
|
||||
### Enhancement
|
||||
- By default, *OBITools4* tries to use all the computing power available on
|
||||
your computer. In some circumstances this can be problematic (e.g. if you
|
||||
are running on a computer cluster managed by your university). You can limit
|
||||
the number of CPU cores used by *OBITools4* or by using the **--max-cpu**
|
||||
option or by setting the **OBIMAXCPU** environment variable. Some strange
|
||||
behavior of *OBITools4* has been observed when users try to limit the
|
||||
maximum number of usable CPU cores to one. This seems to be caused by the Go
|
||||
language, and it is not obvious to get *OBITools4* to run correctly on a
|
||||
single core in all circumstances. Therefore, if you ask to use a single
|
||||
core, **OBITools4** will print a warning message and actually set this
|
||||
parameter to two cores. If you really want a single core, you can use the
|
||||
**--force-one-core** option. But be aware that this can lead to incorrect
|
||||
calculations.
|
||||
|
||||
- In every *OBITools* command, the progress bar are automatically deactivated
|
||||
when the standard error output is redirected.
|
||||
- Because Genbank and ENA:EMBL contain very large sequences, while OBITools4
|
||||
are optimized As Genbank and ENA:EMBL contain very large sequences, while
|
||||
OBITools4 is optimised for short sequences, `obipcr` faces some problems
|
||||
with excessive consumption of computer resources, especially memory. Several
|
||||
improvements in the tuning of the default `obipcr` parameters and some new
|
||||
features, currently only available for FASTA and FASTQ file readers, have
|
||||
been implemented to limit the memory impact of `obipcr` without changing the
|
||||
computational efficiency too much.
|
||||
- Logging system and therefore format, have been homogenized.
|
||||
|
||||
### Bug
|
||||
|
||||
- In `obitag`, correct the wrong assignment of the **obitag_bestmatch**
|
||||
attribute.
|
||||
- In `obiclean`, the **--no-progress-bar** option disables all progress bars,
|
||||
not just the data.
|
||||
- Several fixes in reading FASTA and FASTQ files, including some code
|
||||
simplification and and factorization.
|
||||
- Fixed a bug in all obitools that caused the same file to be processed
|
||||
multiple times. when specifying a directory name as input.
|
||||
|
||||
## April 2nd, 2024. Release 4.2.0
|
||||
|
||||
### New features
|
||||
|
||||
- A new OBITools named `obiscript` allows to process each sequence according
|
||||
- A new OBITools named `obiscript` allows processing each sequence according
|
||||
to a Lua script. This is an experimental tool. The **--template** option
|
||||
allows for generating an example script on the `stdout`.
|
||||
|
||||
@@ -196,7 +236,7 @@
|
||||
|
||||
- Two of the main class `obiseq.SeqWorker` and `obiseq.SeqWorker` have their
|
||||
declaration changed. Both now return two values a `obiseq.BioSequenceSlice`
|
||||
and an `error`. This allow a worker to return potentially several sequences
|
||||
and an `error`. This allows a worker to return potentially several sequences
|
||||
as the result of the processing of a single sequence, or zero, which is
|
||||
equivalent to filtering out the input sequence.
|
||||
|
||||
@@ -204,12 +244,12 @@
|
||||
|
||||
- In `obitag` if the reference database contains sequences annotated by taxid
|
||||
not referenced in the taxonomy, the corresponding sequences are discarded
|
||||
from the reference database and a warning indicating the sequence id and the
|
||||
from the reference database and a warning indicating the sequence *id* and the
|
||||
wrong taxid is emitted.
|
||||
- The bug corrected in the parsing of EMBL and Genbank files as implemented in
|
||||
version 4.1.2 of OBITools4, potentially induced some reduction in the
|
||||
performance of the parsing. This should have been now fixed.
|
||||
- In the same idea, parsing of genbank and EMBL files were reading and storing
|
||||
- In the same idea, parsing of Genbank and EMBL files were reading and storing
|
||||
in memory not only the sequence but also the annotations (features table).
|
||||
Up to now none of the OBITools are using this information, but with large
|
||||
complete genomes, it is occupying a lot of memory. To reduce this impact,
|
||||
@@ -248,7 +288,7 @@
|
||||
|
||||
### New feature
|
||||
|
||||
- In `obimatrix` a **--transpose** option allows to transpose the produced
|
||||
- In `obimatrix` a **--transpose** option allows transposing the produced
|
||||
matrix table in CSV format.
|
||||
- In `obipairing` and `obipcrtag` two new options **--exact-mode** and
|
||||
**--fast-absolute** to control the heuristic used in the alignment
|
||||
@@ -256,7 +296,7 @@
|
||||
the exact algorithm at the cost of speed. **--fast-absolute** changes the
|
||||
scoring schema of the heuristic.
|
||||
- In `obiannotate` adds the possibility to annotate the first match of a
|
||||
pattern using the same algorithm than the one used in `obipcr` and
|
||||
pattern using the same algorithm as the one used in `obipcr` and
|
||||
`obimultiplex`. For that, four options were added:
|
||||
- **--pattern** : to specify the pattern. It can use IUPAC codes and
|
||||
position with no error tolerated has to be followed by a `#` character.
|
||||
@@ -337,7 +377,7 @@
|
||||
|
||||
### Bugs
|
||||
|
||||
- in the obitools language, the `composition` function now returns a map
|
||||
- In the obitools language, the `composition` function now returns a map
|
||||
indexed by lowercase string "a", "c", "g", "t" and "o" for other instead of
|
||||
being indexed by the ASCII codes of the corresponding letters.
|
||||
- Correction of the reverse-complement operation. Every reverse complement of
|
||||
@@ -350,18 +390,18 @@
|
||||
duplicating the quality values. This made `obimultiplex` to produce fastq
|
||||
files with sequences having quality values duplicated.
|
||||
|
||||
### Becareful
|
||||
### Be careful
|
||||
|
||||
GO 1.21.0 is out, and it includes new functionalities which are used in the
|
||||
OBITools4 code. If you use the recommanded method for compiling OBITools on your
|
||||
computer, their is no problem, as the script always load the latest GO version.
|
||||
If you rely on you personnal GO install, please think to update.
|
||||
OBITools4 code. If you use the recommended method for compiling OBITools on your
|
||||
computer, there is no problem, as the script always loads the latest GO version.
|
||||
If you rely on your personal GO install, please remember to update it.
|
||||
|
||||
## August 29th, 2023. Release 4.0.5
|
||||
|
||||
### Bugs
|
||||
|
||||
- Patch a bug in the `obiseq.BioSequence` constructor leading to a error on
|
||||
- Patch a bug in the `obiseq.BioSequence` constructor leading to an error on
|
||||
almost every obitools. The error message indicates : `fatal error: sync:
|
||||
unlock of unlocked mutex` This bug was introduced in the release 4.0.4
|
||||
|
||||
@@ -380,7 +420,7 @@ If you rely on you personnal GO install, please think to update.
|
||||
data structure to limit the number of alignments actually computed. This
|
||||
slightly increases the speed of both programs. `obirefidx` is nevertheless
|
||||
still too slow compared to my expectation.
|
||||
- Switch to a parallel version of the gzip library, allowing for high speed
|
||||
- Switch to a parallel version of the GZIP library, allowing for high speed
|
||||
compress and decompress operation on files.
|
||||
|
||||
### New feature
|
||||
@@ -424,12 +464,12 @@ If you rely on you personnal GO install, please think to update.
|
||||
--unidentified not_assigned.fastq
|
||||
```
|
||||
|
||||
the command produced four files : `tagged_library_R1.fastq` and
|
||||
The command produced four files : `tagged_library_R1.fastq` and
|
||||
`tagged_library_R2.fastq` containing the assigned reads and
|
||||
`not_assigned_R1.fastq` and `not_assigned_R2.fastq` containing the
|
||||
unassignable reads.
|
||||
|
||||
the tagged library files can then be split using `obidistribute`:
|
||||
The tagged library files can then be split using `obidistribute`:
|
||||
|
||||
```{bash}
|
||||
mkdir pcr_reads
|
||||
@@ -439,9 +479,9 @@ If you rely on you personnal GO install, please think to update.
|
||||
|
||||
- Adding of two options **--add-lca-in** and **--lca-error** to `obiannotate`.
|
||||
These options aim to help during construction of reference database using
|
||||
`obipcr`. On obipcr output, it is commonly run obiuniq. To merge identical
|
||||
`obipcr`. It is common to run `obiuniq` on the `obipcr` output. To merge identical
|
||||
sequences annotated with different taxids, it is now possible to use the
|
||||
following strategie :
|
||||
following strategies :
|
||||
|
||||
```{bash}
|
||||
obiuniq -m taxid myrefdb.obipcr.fasta \
|
||||
@@ -472,7 +512,7 @@ If you rely on you personnal GO install, please think to update.
|
||||
- Correction of a bug in `obiconsensus` leading into the deletion of a base
|
||||
close to the beginning of the consensus sequence.
|
||||
|
||||
## March 31th, 2023. Release 4.0.2
|
||||
## March 31st, 2023. Release 4.0.2
|
||||
|
||||
### Compiler change
|
||||
|
||||
@@ -483,15 +523,15 @@ If you rely on you personnal GO install, please think to update.
|
||||
- Add the possibility for looking pattern with indels. This has been added to
|
||||
`obimultiplex` through the **--with-indels** option.
|
||||
- Every obitools command has a **--pprof** option making the command
|
||||
publishing a profiling web site available at the address :
|
||||
publishing a profiling website available at the address :
|
||||
<http://localhost:8080/debug/pprof/>
|
||||
- A new `obiconsensus` command has been added. It is a prototype. It aims to
|
||||
build a consensus sequence from a set of reads. The consensus is estimated
|
||||
for all the sequences contained in the input file. If several input files,
|
||||
or a directory name are provided the result contains a consensus per file.
|
||||
The id of the sequence is the name of the input file depleted of its
|
||||
The *id* of the sequence is the name of the input file depleted of its
|
||||
directory name and of all its extensions.
|
||||
- In `obipcr` an experimental option **--fragmented** allows for spliting very
|
||||
- In `obipcr` an experimental option **--fragmented** allows for splitting very
|
||||
long query sequences into shorter fragments with an overlap between the two
|
||||
contiguous fragments ensuring that no amplicons are missed despite the split.
|
||||
As a side effect, some amplicons can be identified twice.
|
||||
@@ -534,7 +574,7 @@ If you rely on you personnal GO install, please think to update.
|
||||
### Enhancement
|
||||
|
||||
- *OBITools* are automatically processing all the sequences files contained in
|
||||
a directory and its sub-directory\
|
||||
a directory and its subdirectory\
|
||||
recursively if its name is provided as input. To process easily Genbank
|
||||
files, the corresponding filename extensions have been added. Today the
|
||||
following extensions are recognized as sequence files : `.fasta`, `.fastq`,
|
||||
@@ -551,7 +591,7 @@ If you rely on you personnal GO install, please think to update.
|
||||
export OBICPUMAX=4
|
||||
```
|
||||
|
||||
- Adds a new option --out\|-o allowing to specify the name of an outpout file.
|
||||
- Adds a new option --out\|-o allowing to specify the name of an output file.
|
||||
|
||||
``` bash
|
||||
obiconvert -o xyz.fasta xxx.fastq
|
||||
@@ -573,10 +613,10 @@ If you rely on you personnal GO install, please think to update.
|
||||
matched files remain consistent when processed.
|
||||
|
||||
- Adding of the function `ifelse` to the expression language for computing
|
||||
conditionnal values.
|
||||
conditional values.
|
||||
|
||||
- Adding two functions to the expression language related to sequence
|
||||
conposition : `composition` and `gcskew`. Both are taking a sequence as
|
||||
composition : `composition` and `gcskew`. Both are taking a sequence as
|
||||
single argument.
|
||||
|
||||
## February 18th, 2023. Release 4.0.0
|
||||
@@ -584,8 +624,8 @@ If you rely on you personnal GO install, please think to update.
|
||||
It is the first version of the *OBITools* version 4. I decided to tag them
|
||||
following two weeks of intensive data analysis with them, which revealed
|
||||
many small bugs present in the previous non-official version. Obviously other
|
||||
bugs are certainly persent in the code, and you are welcome to use the git
|
||||
ticket system to mention them. But they seems to produce now reliable results.
|
||||
bugs are certainly present in the code, and you are welcome to use the git
|
||||
ticket system to mention them. But they now seem to produce reliable results.
|
||||
|
||||
### Corrected bugs
|
||||
|
||||
@@ -593,11 +633,11 @@ ticket system to mention them. But they seems to produce now reliable results.
|
||||
of sequences and to the production of incorrect files because of the last
|
||||
sequence record, sometimes truncated in the middle. This was only occurring
|
||||
when more than a single CPU was used. It was affecting every obitools.
|
||||
- The `obiparing` software had a bug in the right aligment procedure. This led
|
||||
to the non alignment of very sort barcode during the paring of the forward
|
||||
- The `obipairing` software had a bug in the right alignment procedure. This led
|
||||
to the non-alignment of very short barcodes during the pairing of the forward
|
||||
and reverse reads.
|
||||
- The `obipairing` tools had a non deterministic comportment when aligning a
|
||||
paor very low quality reads. This induced that the result of the same low
|
||||
- The `obipairing` tool had a non-deterministic behavior when aligning a
|
||||
pair of very low quality reads. As a consequence, the result for the same low
|
||||
quality read pair was not the same from run to run.
|
||||
|
||||
### New features
|
||||
@@ -605,11 +645,10 @@ ticket system to mention them. But they seems to produce now reliable results.
|
||||
- Adding of a `--compress|-Z` option to every obitools allowing to produce
|
||||
`gz` compressed output. OBITools were already able to deal with gzipped input
|
||||
files transparently. They can now produce their results in the same format.
|
||||
- Adding of a `--append|-A` option to the `obidistribute` tool. It allows to
|
||||
append the result of an `obidistribute` execution to preexisting files. -
|
||||
- Adding of a `--append|-A` option to the `obidistribute` tool. It allows appending the result of an `obidistribute` execution to preexisting files.
|
||||
- Adding of a `--directory|-d` option to the `obidistribute` tool. It allows
|
||||
to declare a secondary classification key over the one defined by the
|
||||
'--category\|-c\` option. This extra key leads to produce directories in
|
||||
declaring a secondary classification key over the one defined by the
|
||||
`--category|-c` option. This extra key leads to the creation of directories in
|
||||
which files produced according to the primary criterion are stored.
|
||||
- Adding of the functions `subspc`, `printf`, `int`, `numeric`, and `bool` to
|
||||
the expression language.
|
||||
@@ -47,12 +47,27 @@ func main() {
|
||||
obiconvert.OpenSequenceDataErrorMessage(args, err)
|
||||
|
||||
taxo := obitax.DefaultTaxonomy()
|
||||
|
||||
references := obitag.CLIRefDB()
|
||||
|
||||
if references == nil {
|
||||
log.Panicln("No loaded reference database")
|
||||
}
|
||||
|
||||
if taxo == nil {
|
||||
taxo, err = references.ExtractTaxonomy(nil)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("No taxonomy specified or extractable from reference database: %v", err)
|
||||
}
|
||||
|
||||
taxo.SetAsDefault()
|
||||
}
|
||||
|
||||
if taxo == nil {
|
||||
log.Panicln("No loaded taxonomy")
|
||||
}
|
||||
|
||||
references := obitag.CLIRefDB()
|
||||
|
||||
var identified obiiter.IBioSequence
|
||||
|
||||
if obitag.CLIGeometricMode() {
|
||||
|
||||
@@ -1,13 +1,16 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obitaxonomy"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@@ -18,17 +21,49 @@ func main() {
|
||||
var iterator *obitax.ITaxon
|
||||
|
||||
switch {
|
||||
case obitaxonomy.CLIDownloadNCBI():
|
||||
err := obitaxonomy.CLIDownloadNCBITaxdump()
|
||||
if err != nil {
|
||||
log.Errorf("Cannot download NCBI taxonomy: %s", err.Error())
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
os.Exit(0)
|
||||
|
||||
case obitaxonomy.CLIExtractTaxonomy():
|
||||
iter, err := obiconvert.CLIReadBioSequences(args...)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot extract taxonomy: %v", err)
|
||||
}
|
||||
|
||||
taxonomy, err := iter.ExtractTaxonomy()
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot extract taxonomy: %v", err)
|
||||
}
|
||||
|
||||
taxonomy.SetAsDefault()
|
||||
|
||||
log.Infof("Number of extracted taxa: %d", taxonomy.Len())
|
||||
iterator = taxonomy.AsTaxonSet().Sort().Iterator()
|
||||
|
||||
case obitaxonomy.CLIDumpSubtaxonomy():
|
||||
iterator = obitaxonomy.CLISubTaxonomyIterator()
|
||||
|
||||
case obitaxonomy.CLIRequestsPathForTaxid() != "NA":
|
||||
|
||||
taxon := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid())
|
||||
taxon, isAlias, err := obitax.DefaultTaxonomy().Taxon(obitaxonomy.CLIRequestsPathForTaxid())
|
||||
|
||||
if taxon == nil {
|
||||
log.Fatalf("Cannot identify the requested taxon: %s",
|
||||
obitaxonomy.CLIRequestsPathForTaxid())
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot identify the requested taxon: %s (%v)",
|
||||
obitaxonomy.CLIRequestsPathForTaxid(), err)
|
||||
}
|
||||
|
||||
if isAlias {
|
||||
if obidefault.FailOnTaxonomy() {
|
||||
log.Fatalf("Taxon %s is an alias for %s", taxon.String(), taxon.Parent().String())
|
||||
}
|
||||
}
|
||||
|
||||
s := taxon.Path()
|
||||
|
||||
4
go.mod
4
go.mod
@@ -5,7 +5,9 @@ go 1.23.1
|
||||
require (
|
||||
github.com/DavidGamba/go-getoptions v0.28.0
|
||||
github.com/PaesslerAG/gval v1.2.2
|
||||
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9
|
||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df
|
||||
github.com/buger/jsonparser v1.1.1
|
||||
github.com/chen3feng/stl4go v0.1.1
|
||||
github.com/dlclark/regexp2 v1.11.4
|
||||
github.com/goccy/go-json v0.10.3
|
||||
@@ -24,8 +26,6 @@ require (
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/Clever/csvlint v0.3.0 // indirect
|
||||
github.com/buger/jsonparser v1.1.1 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
|
||||
github.com/kr/pretty v0.3.0 // indirect
|
||||
|
||||
5
go.sum
5
go.sum
@@ -1,11 +1,11 @@
|
||||
github.com/Clever/csvlint v0.3.0 h1:58WEFXWy+i0fCbxTXscR2QwYESRuAUFjEGLgZs6j2iU=
|
||||
github.com/Clever/csvlint v0.3.0/go.mod h1:+wLRuW/bI8NhpRoeyUBxqKsK35OhvgJhXHSWdKp5XJU=
|
||||
github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c=
|
||||
github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
|
||||
github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E=
|
||||
github.com/PaesslerAG/gval v1.2.2/go.mod h1:XRFLwvmkTEdYziLdaCeCa5ImcGVrfQbeNUbVR+C6xac=
|
||||
github.com/PaesslerAG/jsonpath v0.1.0 h1:gADYeifvlqK3R3i2cR5B4DGgxLXIPb3TRTH1mGi0jPI=
|
||||
github.com/PaesslerAG/jsonpath v0.1.0/go.mod h1:4BzmtoM/PI8fPO4aQGIusjGxGir2BzcV0grWtFzq1Y8=
|
||||
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9 h1:Zc1/GNsUpgZR9qm1EmRSKrnOHA7CCd0bIzGdq0cREN0=
|
||||
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9/go.mod h1:PZyV4WA3NpqtezSY0h6E6NARAmdDm0qwrydveOyR5Gc=
|
||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0=
|
||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM=
|
||||
github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMUs=
|
||||
@@ -69,7 +69,6 @@ github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ
|
||||
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
|
||||
144
obitests/obitools/obicount/test.sh
Executable file
144
obitests/obitools/obicount/test.sh
Executable file
@@ -0,0 +1,144 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Give the name of the test series here
|
||||
#
|
||||
|
||||
TEST_NAME=obicount
|
||||
|
||||
######
|
||||
#
|
||||
# Some variable and function definitions: please don't change them
|
||||
#
|
||||
######
|
||||
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
||||
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||
|
||||
|
||||
TMPDIR="$(mktemp -d)"
|
||||
ntest=0
|
||||
success=0
|
||||
failed=0
|
||||
|
||||
cleanup() {
|
||||
echo "========================================" 1>&2
|
||||
echo "## Results of the $TEST_NAME tests:" 1>&2
|
||||
|
||||
echo 1>&2
|
||||
echo "- $ntest tests run" 1>&2
|
||||
echo "- $success successfully completed" 1>&2
|
||||
echo "- $failed failed tests" 1>&2
|
||||
echo 1>&2
|
||||
echo "Cleaning up the temporary directory..." 1>&2
|
||||
echo 1>&2
|
||||
echo "========================================" 1>&2
|
||||
|
||||
rm -rf "$TMPDIR" # Suppress the temporary directory
|
||||
|
||||
if [ $failed -gt 0 ]; then
|
||||
log "$TEST_NAME tests failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
||||
}
|
||||
|
||||
log() {
|
||||
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
||||
}
|
||||
|
||||
log "Testing $TEST_NAME..."
|
||||
log "Test directory is $TEST_DIR"
|
||||
log "obitools directory is $OBITOOLS_DIR"
|
||||
log "Temporary directory is $TMPDIR"
|
||||
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||
|
||||
######################################################################
|
||||
####
|
||||
#### Below are the tests
|
||||
####
|
||||
#### Before each test :
|
||||
#### - increment the variable ntest
|
||||
####
|
||||
#### Run the command as the condition of an if / then /else
|
||||
#### - The command must return 0 on success
|
||||
#### - The command must return an exit code different from 0 on failure
|
||||
#### - The data files are stored in the same directory as the test script
|
||||
#### - The test script directory is stored in the TEST_DIR variable
|
||||
#### - If result files have to be produced they must be stored
|
||||
#### in the temporary directory (TMPDIR variable)
|
||||
####
|
||||
#### then clause is executed on success of the command
|
||||
#### - Write a success message using the log function
|
||||
#### - increment the variable success
|
||||
####
|
||||
#### else clause is executed on failure of the command
|
||||
#### - Write a failure message using the log function
|
||||
#### - increment the variable failed
|
||||
####
|
||||
######################################################################
|
||||
|
||||
((ntest++))
|
||||
if obicount "${TEST_DIR}/wolf_F.fasta.gz" \
|
||||
> "${TMPDIR}/wolf_F.fasta_count.csv"
|
||||
then
|
||||
log "OBICount: fasta reading OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: fasta reading failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obicount "${TEST_DIR}/wolf_F.fastq.gz" \
|
||||
> "${TMPDIR}/wolf_F.fastq_count.csv"
|
||||
then
|
||||
log "OBICount: fastq reading OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: fastq reading failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obicount "${TEST_DIR}/wolf_F.csv.gz" \
|
||||
> "${TMPDIR}/wolf_F.csv_count.csv"
|
||||
then
|
||||
log "OBICount: csv reading OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: csv reading failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if diff "${TMPDIR}/wolf_F.fasta_count.csv" \
|
||||
"${TMPDIR}/wolf_F.fastq_count.csv" > /dev/null
|
||||
then
|
||||
log "OBICount: counting on fasta and fastq are identical OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: counting on fasta and fastq are different failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if diff "${TMPDIR}/wolf_F.fasta_count.csv" \
|
||||
"${TMPDIR}/wolf_F.csv_count.csv" > /dev/null
|
||||
then
|
||||
log "OBICount: counting on fasta and csv are identical OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: counting on fasta and csv are different failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
#########################################
|
||||
#
|
||||
# At the end of the tests
|
||||
# the cleanup function is called
|
||||
#
|
||||
#########################################
|
||||
|
||||
cleanup
|
||||
BIN
obitests/obitools/obicount/wolf_F.csv.gz
Normal file
BIN
obitests/obitools/obicount/wolf_F.csv.gz
Normal file
Binary file not shown.
BIN
obitests/obitools/obicount/wolf_F.fasta.gz
Normal file
BIN
obitests/obitools/obicount/wolf_F.fasta.gz
Normal file
Binary file not shown.
BIN
obitests/obitools/obicount/wolf_F.fastq.gz
Normal file
BIN
obitests/obitools/obicount/wolf_F.fastq.gz
Normal file
Binary file not shown.
134
obitests/obitools/obiparing/test.sh
Executable file
134
obitests/obitools/obiparing/test.sh
Executable file
@@ -0,0 +1,134 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Give the name of the test series here
|
||||
#
|
||||
|
||||
TEST_NAME=obiparing
|
||||
|
||||
######
|
||||
#
|
||||
# Some variable and function definitions: please don't change them
|
||||
#
|
||||
######
|
||||
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
||||
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||
|
||||
|
||||
TMPDIR="$(mktemp -d)"
|
||||
ntest=0
|
||||
success=0
|
||||
failed=0
|
||||
|
||||
cleanup() {
|
||||
echo "========================================" 1>&2
|
||||
echo "## Results of the $TEST_NAME tests:" 1>&2
|
||||
|
||||
echo 1>&2
|
||||
echo "- $ntest tests run" 1>&2
|
||||
echo "- $success successfully completed" 1>&2
|
||||
echo "- $failed failed tests" 1>&2
|
||||
echo 1>&2
|
||||
echo "Cleaning up the temporary directory..." 1>&2
|
||||
echo 1>&2
|
||||
echo "========================================" 1>&2
|
||||
|
||||
rm -rf "$TMPDIR" # Suppress the temporary directory
|
||||
|
||||
if [ $failed -gt 0 ]; then
|
||||
log "$TEST_NAME tests failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
||||
}
|
||||
|
||||
log() {
|
||||
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
||||
}
|
||||
|
||||
log "Testing $TEST_NAME..."
|
||||
log "Test directory is $TEST_DIR"
|
||||
log "obitools directory is $OBITOOLS_DIR"
|
||||
log "Temporary directory is $TMPDIR"
|
||||
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||
|
||||
######################################################################
|
||||
####
|
||||
#### Below are the tests
|
||||
####
|
||||
#### Before each test :
|
||||
#### - increment the variable ntest
|
||||
####
|
||||
#### Run the command as the condition of an if / then /else
|
||||
#### - The command must return 0 on success
|
||||
#### - The command must return an exit code different from 0 on failure
|
||||
#### - The data files are stored in the same directory as the test script
|
||||
#### - The test script directory is stored in the TEST_DIR variable
|
||||
#### - If result files have to be produced they must be stored
|
||||
#### in the temporary directory (TMPDIR variable)
|
||||
####
|
||||
#### then clause is executed on success of the command
|
||||
#### - Write a success message using the log function
|
||||
#### - increment the variable success
|
||||
####
|
||||
#### else clause is executed on failure of the command
|
||||
#### - Write a failure message using the log function
|
||||
#### - increment the variable failed
|
||||
####
|
||||
######################################################################
|
||||
|
||||
((ntest++))
|
||||
if obipairing -F "${TEST_DIR}/wolf_F.fastq.gz" \
|
||||
-R "${TEST_DIR}/wolf_R.fastq.gz" \
|
||||
| obidistribute -Z -c mode \
|
||||
-p "${TMPDIR}/wolf_paired_%s.fastq.gz"
|
||||
then
|
||||
log "OBIPairing: sequence pairing OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIPairing: sequence pairing failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obicsv -Z -s -i \
|
||||
-k ali_dir -k ali_length -k paring_fast_count \
|
||||
-k paring_fast_overlap -k paring_fast_score \
|
||||
-k score -k score_norm -k seq_a_single \
|
||||
-k seq_b_single -k seq_ab_match \
|
||||
"${TMPDIR}/wolf_paired_alignment.fastq.gz" \
|
||||
> "${TMPDIR}/wolf_paired_alignment.csv.gz" \
|
||||
&& zdiff -c "${TEST_DIR}/wolf_paired_alignment.csv.gz" \
|
||||
"${TMPDIR}/wolf_paired_alignment.csv.gz"
|
||||
then
|
||||
log "OBIPairing: check aligned sequences OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIPairing: check aligned sequences failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obicsv -Z -s -i \
|
||||
"${TMPDIR}/wolf_paired_join.fastq.gz" \
|
||||
> "${TMPDIR}/wolf_paired_join.csv.gz" \
|
||||
&& zdiff -c "${TEST_DIR}/wolf_paired_join.csv.gz" \
|
||||
"${TMPDIR}/wolf_paired_join.csv.gz"
|
||||
then
|
||||
log "OBIPairing: check joined sequences OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIPairing: check joined sequences failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
#########################################
|
||||
#
|
||||
# At the end of the tests
|
||||
# the cleanup function is called
|
||||
#
|
||||
#########################################
|
||||
|
||||
cleanup
|
||||
BIN
obitests/obitools/obiparing/wolf_F.fastq.gz
Normal file
BIN
obitests/obitools/obiparing/wolf_F.fastq.gz
Normal file
Binary file not shown.
BIN
obitests/obitools/obiparing/wolf_R.fastq.gz
Normal file
BIN
obitests/obitools/obiparing/wolf_R.fastq.gz
Normal file
Binary file not shown.
BIN
obitests/obitools/obiparing/wolf_paired_alignment.csv.gz
Normal file
BIN
obitests/obitools/obiparing/wolf_paired_alignment.csv.gz
Normal file
Binary file not shown.
BIN
obitests/obitools/obiparing/wolf_paired_join.csv.gz
Normal file
BIN
obitests/obitools/obiparing/wolf_paired_join.csv.gz
Normal file
Binary file not shown.
@@ -10,6 +10,7 @@ import (
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
// // A pool of byte slices.
|
||||
@@ -158,12 +159,30 @@ func BuildQualityConsensus(seqA, seqB *obiseq.BioSequence, path []int, statOnMis
|
||||
|
||||
match := 0
|
||||
|
||||
left := obiutils.Abs(path[0])
|
||||
right := 0
|
||||
if path[len(path)-1] == 0 {
|
||||
right = path[len(path)-2]
|
||||
}
|
||||
|
||||
right = obiutils.Abs(right)
|
||||
|
||||
right = len(*bufferQA) - right
|
||||
|
||||
// log.Warnf("BuildQualityConsensus: left = %d right = %d\n", left, right)
|
||||
|
||||
for i, qA = range *bufferQA {
|
||||
nA := (*bufferSA)[i]
|
||||
nB := (*bufferSB)[i]
|
||||
qB = (*bufferQB)[i]
|
||||
|
||||
if statOnMismatch && nA != nB && nA != ' ' && nB != ' ' {
|
||||
if statOnMismatch && i >= left && i < right && nA != nB {
|
||||
if nA == ' ' {
|
||||
nA = '-'
|
||||
}
|
||||
if nB == ' ' {
|
||||
nB = '-'
|
||||
}
|
||||
mismatches[strings.ToUpper(fmt.Sprintf("(%c:%02d)->(%c:%02d)", nA, qA, nB, qB))] = i + 1
|
||||
}
|
||||
|
||||
@@ -183,13 +202,12 @@ func BuildQualityConsensus(seqA, seqB *obiseq.BioSequence, path []int, statOnMis
|
||||
|
||||
q := qA + qB
|
||||
|
||||
if qA > 0 && qB > 0 {
|
||||
if nA != nB {
|
||||
q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/30))*10+0.5)
|
||||
}
|
||||
if nA == nB {
|
||||
match++
|
||||
}
|
||||
if nA != nB {
|
||||
q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/40))*10+0.5)
|
||||
}
|
||||
|
||||
if nA == nB {
|
||||
match++
|
||||
}
|
||||
|
||||
if q > 90 {
|
||||
|
||||
@@ -74,6 +74,30 @@ func _Logaddexp(a, b float64) float64 {
|
||||
return b + math.Log1p(math.Exp(a-b))
|
||||
}
|
||||
|
||||
func _Log1mexp(a float64) float64 {
|
||||
if a > 0 {
|
||||
log.Panic("Log1mexp: a > 0")
|
||||
}
|
||||
|
||||
if a == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
return (math.Log(-math.Expm1(a)))
|
||||
}
|
||||
|
||||
func _Logdiffexp(a, b float64) float64 {
|
||||
if a < b {
|
||||
log.Panic("Log1mexp: a < b")
|
||||
}
|
||||
|
||||
if a == b {
|
||||
return math.Inf(-1)
|
||||
}
|
||||
|
||||
return a + _Log1mexp(b-a)
|
||||
}
|
||||
|
||||
// _MatchScoreRatio calculates the match score ratio between two bytes.
|
||||
//
|
||||
// Parameters:
|
||||
@@ -83,25 +107,25 @@ func _Logaddexp(a, b float64) float64 {
|
||||
// Returns:
|
||||
// - float64: the match score ratio when a match is observed
|
||||
// - float64: the match score ratio when a mismatch is observed
|
||||
func _MatchScoreRatio(a, b byte) (float64, float64) {
|
||||
func _MatchScoreRatio(QF, QR byte) (float64, float64) {
|
||||
|
||||
l2 := math.Log(2)
|
||||
l3 := math.Log(3)
|
||||
l4 := math.Log(4)
|
||||
l10 := math.Log(10)
|
||||
lalea := math.Log(4) // 1 /(change of the random model)
|
||||
lE1 := -float64(a)/10*l10 - l3 // log proba of sequencing error on A/3
|
||||
lE2 := -float64(b)/10*l10 - l3 // log proba of sequencing error on B/3
|
||||
lO1 := math.Log1p(-math.Exp(lE1 + l3)) // log proba no being an error on A
|
||||
lO2 := math.Log1p(-math.Exp(lE2 + l3)) // log proba no being an error on B
|
||||
lO1O2 := lO1 + lO2
|
||||
lE1E2 := lE1 + lE2
|
||||
lO1E2 := lO1 + lE2
|
||||
lO2E1 := lO2 + lE1
|
||||
qF := -float64(QF) / 10 * l10
|
||||
qR := -float64(QR) / 10 * l10
|
||||
term1 := _Logaddexp(qF, qR)
|
||||
term2 := _Logdiffexp(term1, qF+qR)
|
||||
|
||||
MM := _Logaddexp(lO1O2, lE1E2+l3) // Proba match when match observed
|
||||
Mm := _Logaddexp(_Logaddexp(lO1E2, lO2E1), lE1E2+l2) // Proba match when mismatch observed
|
||||
// log.Warnf("MatchScoreRatio: %v, %v , %v, %v", QF, QR, term1, term2)
|
||||
|
||||
return MM + lalea, Mm + lalea
|
||||
match_logp := _Log1mexp(term2 + l3 - l4)
|
||||
match_score := match_logp - _Log1mexp(match_logp)
|
||||
|
||||
mismatch_logp := term2 - l4
|
||||
mismatch_score := mismatch_logp - _Log1mexp(mismatch_logp)
|
||||
|
||||
return match_score, mismatch_score
|
||||
}
|
||||
|
||||
func _InitNucPartMatch() {
|
||||
|
||||
@@ -21,15 +21,15 @@ func encodeValues(score, length int, out bool) uint64 {
|
||||
return fo
|
||||
}
|
||||
|
||||
func _isout(value uint64) bool {
|
||||
const outmask = uint64(1) << dwsize
|
||||
return (value & outmask) == 0
|
||||
}
|
||||
// func _isout(value uint64) bool {
|
||||
// const outmask = uint64(1) << dwsize
|
||||
// return (value & outmask) == 0
|
||||
// }
|
||||
|
||||
func _lpath(value uint64) int {
|
||||
const mask = uint64(1<<wsize) - 1
|
||||
return int(((value + 1) ^ mask) & mask)
|
||||
}
|
||||
// func _lpath(value uint64) int {
|
||||
// const mask = uint64(1<<wsize) - 1
|
||||
// return int(((value + 1) ^ mask) & mask)
|
||||
// }
|
||||
|
||||
func decodeValues(value uint64) (int, int, bool) {
|
||||
const mask = uint64(1<<wsize) - 1
|
||||
@@ -57,4 +57,3 @@ func _setout(value uint64) uint64 {
|
||||
var _empty = encodeValues(0, 0, false)
|
||||
var _out = encodeValues(0, 30000, true)
|
||||
var _notavail = encodeValues(0, 30000, false)
|
||||
|
||||
|
||||
@@ -625,6 +625,8 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
|
||||
&arena.pointer.scoreMatrix,
|
||||
&arena.pointer.pathMatrix)
|
||||
|
||||
score = scoreR
|
||||
|
||||
path = _Backtracking(arena.pointer.pathMatrix,
|
||||
len(rawSeqA), len(rawSeqB),
|
||||
&(arena.pointer.path))
|
||||
@@ -641,6 +643,7 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
|
||||
len(rawSeqA), len(rawSeqB),
|
||||
&(arena.pointer.path))
|
||||
isLeftAlign = true
|
||||
score = scoreL
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -2,6 +2,8 @@ package obidefault
|
||||
|
||||
var __taxonomy__ = ""
|
||||
var __alternative_name__ = false
|
||||
var __fail_on_taxonomy__ = false
|
||||
var __update_taxid__ = false
|
||||
|
||||
func SelectedTaxonomy() string {
|
||||
return __taxonomy__
|
||||
@@ -30,3 +32,27 @@ func SetSelectedTaxonomy(taxonomy string) {
|
||||
func SetAlternativeNamesSelected(alt bool) {
|
||||
__alternative_name__ = alt
|
||||
}
|
||||
|
||||
func SetFailOnTaxonomy(fail bool) {
|
||||
__fail_on_taxonomy__ = fail
|
||||
}
|
||||
|
||||
func SetUpdateTaxid(update bool) {
|
||||
__update_taxid__ = update
|
||||
}
|
||||
|
||||
func FailOnTaxonomyPtr() *bool {
|
||||
return &__fail_on_taxonomy__
|
||||
}
|
||||
|
||||
func UpdateTaxidPtr() *bool {
|
||||
return &__update_taxid__
|
||||
}
|
||||
|
||||
func FailOnTaxonomy() bool {
|
||||
return __fail_on_taxonomy__
|
||||
}
|
||||
|
||||
func UpdateTaxid() bool {
|
||||
return __update_taxid__
|
||||
}
|
||||
|
||||
@@ -9,12 +9,11 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/buger/jsonparser"
|
||||
)
|
||||
|
||||
func _parse_json_map_string(str []byte, sequence *obiseq.BioSequence) (map[string]string, error) {
|
||||
func _parse_json_map_string(str []byte) (map[string]string, error) {
|
||||
values := make(map[string]string)
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -26,7 +25,7 @@ func _parse_json_map_string(str []byte, sequence *obiseq.BioSequence) (map[strin
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_map_int(str []byte, sequence *obiseq.BioSequence) (map[string]int, error) {
|
||||
func _parse_json_map_int(str []byte) (map[string]int, error) {
|
||||
values := make(map[string]int)
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -42,7 +41,7 @@ func _parse_json_map_int(str []byte, sequence *obiseq.BioSequence) (map[string]i
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_map_float(str []byte, sequence *obiseq.BioSequence) (map[string]float64, error) {
|
||||
func _parse_json_map_float(str []byte) (map[string]float64, error) {
|
||||
values := make(map[string]float64)
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -58,7 +57,7 @@ func _parse_json_map_float(str []byte, sequence *obiseq.BioSequence) (map[string
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_map_bool(str []byte, sequence *obiseq.BioSequence) (map[string]bool, error) {
|
||||
func _parse_json_map_bool(str []byte) (map[string]bool, error) {
|
||||
values := make(map[string]bool)
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -74,7 +73,7 @@ func _parse_json_map_bool(str []byte, sequence *obiseq.BioSequence) (map[string]
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_map_interface(str []byte, sequence *obiseq.BioSequence) (map[string]interface{}, error) {
|
||||
func _parse_json_map_interface(str []byte) (map[string]interface{}, error) {
|
||||
values := make(map[string]interface{})
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -101,7 +100,7 @@ func _parse_json_map_interface(str []byte, sequence *obiseq.BioSequence) (map[st
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_array_string(str []byte, sequence *obiseq.BioSequence) ([]string, error) {
|
||||
func _parse_json_array_string(str []byte) ([]string, error) {
|
||||
values := make([]string, 0)
|
||||
jsonparser.ArrayEach(str,
|
||||
func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
|
||||
@@ -163,7 +162,7 @@ func _parse_json_array_bool(str []byte, sequence *obiseq.BioSequence) ([]bool, e
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_array_interface(str []byte, sequence *obiseq.BioSequence) ([]interface{}, error) {
|
||||
func _parse_json_array_interface(str []byte) ([]interface{}, error) {
|
||||
values := make([]interface{}, 0)
|
||||
jsonparser.ArrayEach(str,
|
||||
func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
|
||||
@@ -201,8 +200,6 @@ func _parse_json_array_interface(str []byte, sequence *obiseq.BioSequence) ([]in
|
||||
}
|
||||
|
||||
func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
taxonomy := obitax.DefaultTaxonomy()
|
||||
|
||||
annotations := sequence.Annotations()
|
||||
start := -1
|
||||
stop := -1
|
||||
@@ -264,14 +261,14 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
sequence.SetCount(int(count))
|
||||
|
||||
case skey == "obiclean_weight":
|
||||
weight, err := _parse_json_map_int(value, sequence)
|
||||
weight, err := _parse_json_map_int(value)
|
||||
if err != nil {
|
||||
log.Fatalf("%s: Cannot parse obiclean weight %s", sequence.Id(), string(value))
|
||||
}
|
||||
annotations[skey] = weight
|
||||
|
||||
case skey == "obiclean_status":
|
||||
status, err := _parse_json_map_string(value, sequence)
|
||||
status, err := _parse_json_map_string(value)
|
||||
if err != nil {
|
||||
log.Fatalf("%s: Cannot parse obiclean status %s", sequence.Id(), string(value))
|
||||
}
|
||||
@@ -279,7 +276,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
|
||||
case strings.HasPrefix(skey, "merged_"):
|
||||
if dataType == jsonparser.Object {
|
||||
data, err := _parse_json_map_int(value, sequence)
|
||||
data, err := _parse_json_map_int(value)
|
||||
if err != nil {
|
||||
log.Fatalf("%s: Cannot parse merged slot %s: %v", sequence.Id(), skey, err)
|
||||
} else {
|
||||
@@ -291,13 +288,8 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
|
||||
case skey == "taxid":
|
||||
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
||||
taxid := obiutils.UnsafeString(value)
|
||||
taxon := taxonomy.Taxon(taxid)
|
||||
if taxon != nil {
|
||||
sequence.SetTaxon(taxon)
|
||||
} else {
|
||||
sequence.SetTaxid(string(value))
|
||||
}
|
||||
taxid := string(value)
|
||||
sequence.SetTaxid(taxid)
|
||||
} else {
|
||||
log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value))
|
||||
}
|
||||
@@ -306,15 +298,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
if dataType == jsonparser.Number || dataType == jsonparser.String {
|
||||
rank, _ := obiutils.SplitInTwo(skey, '_')
|
||||
|
||||
taxid := obiutils.UnsafeString(value)
|
||||
taxon := taxonomy.Taxon(taxid)
|
||||
|
||||
if taxon != nil {
|
||||
taxid = taxon.String()
|
||||
} else {
|
||||
taxid = string(value)
|
||||
}
|
||||
|
||||
taxid := string(value)
|
||||
sequence.SetTaxid(taxid, rank)
|
||||
} else {
|
||||
log.Fatalf("%s: Cannot parse taxid %s", sequence.Id(), string(value))
|
||||
@@ -332,9 +316,9 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
annotations[skey], err = strconv.ParseFloat(obiutils.UnsafeString(value), 64)
|
||||
}
|
||||
case jsonparser.Array:
|
||||
annotations[skey], err = _parse_json_array_interface(value, sequence)
|
||||
annotations[skey], err = _parse_json_array_interface(value)
|
||||
case jsonparser.Object:
|
||||
annotations[skey], err = _parse_json_map_interface(value, sequence)
|
||||
annotations[skey], err = _parse_json_map_interface(value)
|
||||
case jsonparser.Boolean:
|
||||
annotations[skey], err = jsonparser.ParseBoolean(value)
|
||||
case jsonparser.Null:
|
||||
|
||||
@@ -72,7 +72,7 @@ func OBIMimeTypeGuesser(stream io.Reader) (*mimetype.MIME, io.Reader, error) {
|
||||
}
|
||||
|
||||
fastqDetector := func(raw []byte, limit uint32) bool {
|
||||
ok, err := regexp.Match("^@[^ ].*\n[^ ]+\n\\+", raw)
|
||||
ok, err := regexp.Match("^@[^ ].*\n[A-Za-z.-]+", raw)
|
||||
return ok && err == nil
|
||||
}
|
||||
|
||||
|
||||
18
pkg/obiiter/extract_taxonomy.go
Normal file
18
pkg/obiiter/extract_taxonomy.go
Normal file
@@ -0,0 +1,18 @@
|
||||
package obiiter
|
||||
|
||||
import "git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
|
||||
func (iterator *IBioSequence) ExtractTaxonomy() (taxonomy *obitax.Taxonomy, err error) {
|
||||
|
||||
for iterator.Next() {
|
||||
slice := iterator.Get().Slice()
|
||||
|
||||
taxonomy, err = slice.ExtractTaxonomy(taxonomy)
|
||||
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
@@ -19,7 +19,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
|
||||
newiter.WaitAndClose()
|
||||
}()
|
||||
|
||||
f := func(iterator IBioSequence, id int) {
|
||||
f := func(iterator IBioSequence) {
|
||||
source := ""
|
||||
for iterator.Next() {
|
||||
news := obiseq.MakeBioSequenceSlice()
|
||||
@@ -66,9 +66,9 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
|
||||
}
|
||||
|
||||
for i := 1; i < nworkers; i++ {
|
||||
go f(iterator.Split(), i)
|
||||
go f(iterator.Split())
|
||||
}
|
||||
go f(iterator, 0)
|
||||
go f(iterator)
|
||||
|
||||
return newiter.SortBatches().Rebatch(size)
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package obikmer
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
var __single_base_code__ = []byte{0,
|
||||
@@ -131,33 +132,39 @@ func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obise
|
||||
maxshift := 0
|
||||
maxcount := 0
|
||||
maxscore := -1.0
|
||||
maxrelscore := -1.0
|
||||
|
||||
for shift, count := range *shifts {
|
||||
delete((*shifts), shift)
|
||||
score := float64(count)
|
||||
if relscore {
|
||||
over := -shift
|
||||
switch {
|
||||
case shift > 0:
|
||||
over += lindex
|
||||
case shift < 0:
|
||||
over = seq.Len() - over
|
||||
default:
|
||||
over = min(lindex, seq.Len())
|
||||
}
|
||||
score = score / float64(over-3)
|
||||
selectscore := float64(count)
|
||||
relativescore := float64(count)
|
||||
over := -shift
|
||||
switch {
|
||||
case shift > 0:
|
||||
over += lindex
|
||||
case shift < 0:
|
||||
over = seq.Len() - over
|
||||
default:
|
||||
over = min(lindex, seq.Len())
|
||||
}
|
||||
if score > maxscore {
|
||||
relativescore = relativescore / float64(over-3)
|
||||
if relscore {
|
||||
selectscore = relativescore
|
||||
}
|
||||
|
||||
if selectscore > maxscore {
|
||||
maxshift = shift
|
||||
maxcount = count
|
||||
maxscore = score
|
||||
maxscore = selectscore
|
||||
maxrelscore = relativescore
|
||||
} else {
|
||||
if score == maxscore && shift < maxshift {
|
||||
if selectscore == maxscore && obiutils.Abs(shift) < obiutils.Abs(maxshift) {
|
||||
maxshift = shift
|
||||
maxcount = count
|
||||
maxrelscore = relativescore
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return maxshift, maxcount, maxscore
|
||||
return maxshift, maxcount, maxrelscore
|
||||
}
|
||||
|
||||
@@ -4,4 +4,5 @@ import lua "github.com/yuin/gopher-lua"
|
||||
|
||||
func RegisterObilib(luaState *lua.LState) {
|
||||
RegisterObiSeq(luaState)
|
||||
RegisterObiTaxonomy(luaState)
|
||||
}
|
||||
|
||||
@@ -1,7 +1,9 @@
|
||||
package obilua
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
lua "github.com/yuin/gopher-lua"
|
||||
)
|
||||
|
||||
@@ -16,6 +18,7 @@ func registerBioSequenceType(luaState *lua.LState) {
|
||||
bioSequenceType := luaState.NewTypeMetatable(luaBioSequenceTypeName)
|
||||
luaState.SetGlobal(luaBioSequenceTypeName, bioSequenceType)
|
||||
luaState.SetField(bioSequenceType, "new", luaState.NewFunction(newObiSeq))
|
||||
luaState.SetField(bioSequenceType, "nil", obiseq2Lua(luaState, nil))
|
||||
|
||||
luaState.SetField(bioSequenceType, "__index",
|
||||
luaState.SetFuncs(luaState.NewTable(),
|
||||
@@ -53,6 +56,7 @@ var bioSequenceMethods = map[string]lua.LGFunction{
|
||||
"definition": bioSequenceGetSetDefinition,
|
||||
"count": bioSequenceGetSetCount,
|
||||
"taxid": bioSequenceGetSetTaxid,
|
||||
"taxon": bioSequenceGetSetTaxon,
|
||||
"attribute": bioSequenceGetSetAttribute,
|
||||
"len": bioSequenceGetLength,
|
||||
"has_sequence": bioSequenceHasSequence,
|
||||
@@ -62,6 +66,9 @@ var bioSequenceMethods = map[string]lua.LGFunction{
|
||||
"md5_string": bioSequenceGetMD5String,
|
||||
"subsequence": bioSequenceGetSubsequence,
|
||||
"reverse_complement": bioSequenceGetRevcomp,
|
||||
"fasta": bioSequenceGetFasta,
|
||||
"fastq": bioSequenceGetFastq,
|
||||
"string": bioSequenceAsString,
|
||||
}
|
||||
|
||||
// checkBioSequence checks if the first argument in the Lua stack is a *obiseq.BioSequence.
|
||||
@@ -254,3 +261,88 @@ func bioSequenceGetRevcomp(luaState *lua.LState) int {
|
||||
luaState.Push(obiseq2Lua(luaState, revcomp))
|
||||
return 1
|
||||
}
|
||||
|
||||
func bioSequenceGetSetTaxon(luaState *lua.LState) int {
|
||||
s := checkBioSequence(luaState)
|
||||
|
||||
if luaState.GetTop() > 1 {
|
||||
taxon := checkTaxon(luaState, 2)
|
||||
|
||||
s.SetTaxon(taxon)
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
taxon := s.Taxon(obitax.DefaultTaxonomy())
|
||||
luaState.Push(taxon2Lua(luaState, taxon))
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
func bioSequenceGetFasta(luaState *lua.LState) int {
|
||||
s := checkBioSequence(luaState)
|
||||
|
||||
formater := obiformats.FormatFastSeqJsonHeader
|
||||
|
||||
if luaState.GetTop() > 1 {
|
||||
format := luaState.CheckString(2)
|
||||
switch format {
|
||||
case "json":
|
||||
formater = obiformats.FormatFastSeqJsonHeader
|
||||
case "obi":
|
||||
formater = obiformats.FormatFastSeqOBIHeader
|
||||
}
|
||||
}
|
||||
|
||||
txt := obiformats.FormatFasta(s, formater)
|
||||
|
||||
luaState.Push(lua.LString(txt))
|
||||
return 1
|
||||
}
|
||||
|
||||
func bioSequenceGetFastq(luaState *lua.LState) int {
|
||||
s := checkBioSequence(luaState)
|
||||
|
||||
formater := obiformats.FormatFastSeqJsonHeader
|
||||
|
||||
if luaState.GetTop() > 1 {
|
||||
format := luaState.CheckString(2)
|
||||
switch format {
|
||||
case "json":
|
||||
formater = obiformats.FormatFastSeqJsonHeader
|
||||
case "obi":
|
||||
formater = obiformats.FormatFastSeqOBIHeader
|
||||
}
|
||||
}
|
||||
|
||||
txt := obiformats.FormatFastq(s, formater)
|
||||
|
||||
luaState.Push(lua.LString(txt))
|
||||
return 1
|
||||
}
|
||||
|
||||
func bioSequenceAsString(luaState *lua.LState) int {
|
||||
s := checkBioSequence(luaState)
|
||||
|
||||
formater := obiformats.FormatFastSeqJsonHeader
|
||||
format := obiformats.FormatFasta
|
||||
|
||||
if s.HasQualities() {
|
||||
format = obiformats.FormatFastq
|
||||
}
|
||||
|
||||
if luaState.GetTop() > 1 {
|
||||
format := luaState.CheckString(2)
|
||||
switch format {
|
||||
case "json":
|
||||
formater = obiformats.FormatFastSeqJsonHeader
|
||||
case "obi":
|
||||
formater = obiformats.FormatFastSeqOBIHeader
|
||||
}
|
||||
}
|
||||
|
||||
txt := format(s, formater)
|
||||
|
||||
luaState.Push(lua.LString(txt))
|
||||
return 1
|
||||
}
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
package obilua
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
lua "github.com/yuin/gopher-lua"
|
||||
)
|
||||
@@ -11,6 +14,7 @@ func registerBioSequenceSliceType(luaState *lua.LState) {
|
||||
bioSequenceSliceType := luaState.NewTypeMetatable(luaBioSequenceSliceTypeName)
|
||||
luaState.SetGlobal(luaBioSequenceSliceTypeName, bioSequenceSliceType)
|
||||
luaState.SetField(bioSequenceSliceType, "new", luaState.NewFunction(newObiSeqSlice))
|
||||
luaState.SetField(bioSequenceSliceType, "nil", obiseqslice2Lua(luaState, nil))
|
||||
|
||||
luaState.SetField(bioSequenceSliceType, "__index",
|
||||
luaState.SetFuncs(luaState.NewTable(),
|
||||
@@ -37,6 +41,9 @@ var bioSequenceSliceMethods = map[string]lua.LGFunction{
|
||||
"pop": bioSequenceSlicePop,
|
||||
"sequence": bioSequenceSliceGetSetSequence,
|
||||
"len": bioSequenceSliceGetLength,
|
||||
"fasta": bioSequenceSliceGetFasta,
|
||||
"fastq": bioSequenceSliceGetFastq,
|
||||
"string": bioSequenceSliceAsString,
|
||||
}
|
||||
|
||||
func checkBioSequenceSlice(L *lua.LState) *obiseq.BioSequenceSlice {
|
||||
@@ -105,3 +112,96 @@ func bioSequenceSlicePop(luaState *lua.LState) int {
|
||||
return 1
|
||||
|
||||
}
|
||||
|
||||
func bioSequenceSliceGetFasta(luaState *lua.LState) int {
|
||||
s := checkBioSequenceSlice(luaState)
|
||||
|
||||
formater := obiformats.FormatFastSeqJsonHeader
|
||||
|
||||
if luaState.GetTop() > 1 {
|
||||
format := luaState.CheckString(2)
|
||||
switch format {
|
||||
case "json":
|
||||
formater = obiformats.FormatFastSeqJsonHeader
|
||||
case "obi":
|
||||
formater = obiformats.FormatFastSeqOBIHeader
|
||||
}
|
||||
}
|
||||
|
||||
txts := make([]string, len(*s))
|
||||
|
||||
for i, seq := range *s {
|
||||
txts[i] = obiformats.FormatFasta(seq, formater)
|
||||
}
|
||||
|
||||
txt := strings.Join(txts, "\n")
|
||||
|
||||
luaState.Push(lua.LString(txt))
|
||||
return 1
|
||||
}
|
||||
|
||||
func bioSequenceSliceGetFastq(luaState *lua.LState) int {
|
||||
s := checkBioSequenceSlice(luaState)
|
||||
|
||||
formater := obiformats.FormatFastSeqJsonHeader
|
||||
|
||||
if luaState.GetTop() > 1 {
|
||||
format := luaState.CheckString(2)
|
||||
switch format {
|
||||
case "json":
|
||||
formater = obiformats.FormatFastSeqJsonHeader
|
||||
case "obi":
|
||||
formater = obiformats.FormatFastSeqOBIHeader
|
||||
}
|
||||
}
|
||||
|
||||
txts := make([]string, len(*s))
|
||||
|
||||
for i, seq := range *s {
|
||||
txts[i] = obiformats.FormatFastq(seq, formater)
|
||||
}
|
||||
|
||||
txt := strings.Join(txts, "\n")
|
||||
|
||||
luaState.Push(lua.LString(txt))
|
||||
return 1
|
||||
}
|
||||
|
||||
func bioSequenceSliceAsString(luaState *lua.LState) int {
|
||||
s := checkBioSequenceSlice(luaState)
|
||||
|
||||
formater := obiformats.FormatFastSeqJsonHeader
|
||||
|
||||
if luaState.GetTop() > 1 {
|
||||
format := luaState.CheckString(2)
|
||||
switch format {
|
||||
case "json":
|
||||
formater = obiformats.FormatFastSeqJsonHeader
|
||||
case "obi":
|
||||
formater = obiformats.FormatFastSeqOBIHeader
|
||||
}
|
||||
}
|
||||
|
||||
txts := make([]string, len(*s))
|
||||
|
||||
format := obiformats.FormatFasta
|
||||
|
||||
allQual := true
|
||||
|
||||
for _, s := range *s {
|
||||
allQual = allQual && s.HasQualities()
|
||||
}
|
||||
|
||||
if allQual {
|
||||
format = obiformats.FormatFastq
|
||||
}
|
||||
|
||||
for i, seq := range *s {
|
||||
txts[i] = format(seq, formater)
|
||||
}
|
||||
|
||||
txt := strings.Join(txts, "\n")
|
||||
|
||||
luaState.Push(lua.LString(txt))
|
||||
return 1
|
||||
}
|
||||
|
||||
139
pkg/obilua/obitaxon.go
Normal file
139
pkg/obilua/obitaxon.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package obilua
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
lua "github.com/yuin/gopher-lua"
|
||||
)
|
||||
|
||||
const luaTaxonTypeName = "Taxon"
|
||||
|
||||
func registerTaxonType(luaState *lua.LState) {
|
||||
taxonType := luaState.NewTypeMetatable(luaTaxonTypeName)
|
||||
luaState.SetGlobal(luaTaxonTypeName, taxonType)
|
||||
luaState.SetField(taxonType, "new", luaState.NewFunction(newTaxon))
|
||||
luaState.SetField(taxonType, "nil", taxonomy2Lua(luaState, nil))
|
||||
|
||||
luaState.SetField(taxonType, "__index",
|
||||
luaState.SetFuncs(luaState.NewTable(),
|
||||
taxonMethods))
|
||||
}
|
||||
|
||||
func taxon2Lua(interpreter *lua.LState,
|
||||
taxon *obitax.Taxon) lua.LValue {
|
||||
ud := interpreter.NewUserData()
|
||||
ud.Value = taxon
|
||||
interpreter.SetMetatable(ud, interpreter.GetTypeMetatable(luaTaxonTypeName))
|
||||
|
||||
return ud
|
||||
}
|
||||
|
||||
func newTaxon(luaState *lua.LState) int {
|
||||
taxonomy := checkTaxonomy(luaState)
|
||||
taxid := luaState.CheckString(2)
|
||||
parent := luaState.CheckString(3)
|
||||
sname := luaState.CheckString(4)
|
||||
rank := luaState.CheckString(5)
|
||||
|
||||
isroot := false
|
||||
|
||||
if luaState.GetTop() > 5 {
|
||||
isroot = luaState.CheckBool(6)
|
||||
}
|
||||
|
||||
taxon, err := taxonomy.AddTaxon(taxid, parent, rank, isroot, false)
|
||||
|
||||
if err != nil {
|
||||
luaState.RaiseError("(%v,%v,%v) : Error on taxon creation: %v", taxid, parent, sname, err)
|
||||
return 0
|
||||
}
|
||||
|
||||
taxon.SetName(sname, "scientific name")
|
||||
|
||||
luaState.Push(taxon2Lua(luaState, taxon))
|
||||
return 1
|
||||
}
|
||||
|
||||
var taxonMethods = map[string]lua.LGFunction{
|
||||
"string": taxonAsString,
|
||||
"scientific_name": taxonGetSetScientificName,
|
||||
"parent": taxonGetParent,
|
||||
"taxon_at_rank": taxGetTaxonAtRank,
|
||||
"species": taxonGetSpecies,
|
||||
"genus": taxonGetGenus,
|
||||
"family": taxonGetFamily,
|
||||
}
|
||||
|
||||
func checkTaxon(L *lua.LState, i int) *obitax.Taxon {
|
||||
ud := L.CheckUserData(i)
|
||||
if v, ok := ud.Value.(*obitax.Taxon); ok {
|
||||
return v
|
||||
}
|
||||
L.ArgError(i, "obitax.Taxon expected")
|
||||
return nil
|
||||
}
|
||||
|
||||
func taxonAsString(luaState *lua.LState) int {
|
||||
taxon := checkTaxon(luaState, 1)
|
||||
luaState.Push(lua.LString(taxon.String()))
|
||||
return 1
|
||||
}
|
||||
|
||||
func taxonGetSetScientificName(luaState *lua.LState) int {
|
||||
taxon := checkTaxon(luaState, 1)
|
||||
|
||||
if luaState.GetTop() > 1 {
|
||||
sname := luaState.CheckString(2)
|
||||
taxon.SetName(sname, "scientific name")
|
||||
return 0
|
||||
}
|
||||
|
||||
luaState.Push(lua.LString(taxon.ScientificName()))
|
||||
return 1
|
||||
}
|
||||
|
||||
func taxonGetParent(luaState *lua.LState) int {
|
||||
taxon := checkTaxon(luaState, 1)
|
||||
|
||||
parent := taxon.Parent()
|
||||
luaState.Push(taxon2Lua(luaState, parent))
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
func taxonGetSpecies(luaState *lua.LState) int {
|
||||
taxon := checkTaxon(luaState, 1)
|
||||
|
||||
species := taxon.Species()
|
||||
luaState.Push(taxon2Lua(luaState, species))
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
func taxonGetGenus(luaState *lua.LState) int {
|
||||
taxon := checkTaxon(luaState, 1)
|
||||
|
||||
genus := taxon.Genus()
|
||||
luaState.Push(taxon2Lua(luaState, genus))
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
func taxonGetFamily(luaState *lua.LState) int {
|
||||
taxon := checkTaxon(luaState, 1)
|
||||
|
||||
family := taxon.Family()
|
||||
luaState.Push(taxon2Lua(luaState, family))
|
||||
|
||||
return 1
|
||||
}
|
||||
|
||||
func taxGetTaxonAtRank(luaState *lua.LState) int {
|
||||
taxon := checkTaxon(luaState, 1)
|
||||
rank := luaState.CheckString(2)
|
||||
|
||||
taxonAt := taxon.TaxonAtRank(rank)
|
||||
|
||||
luaState.Push(taxon2Lua(luaState, taxonAt))
|
||||
|
||||
return 1
|
||||
}
|
||||
116
pkg/obilua/obitaxonomy.go
Normal file
116
pkg/obilua/obitaxonomy.go
Normal file
@@ -0,0 +1,116 @@
|
||||
package obilua
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
lua "github.com/yuin/gopher-lua"
|
||||
)
|
||||
|
||||
func RegisterObiTaxonomy(luaState *lua.LState) {
|
||||
registerTaxonomyType(luaState)
|
||||
registerTaxonType(luaState)
|
||||
}
|
||||
|
||||
const luaTaxonomyTypeName = "Taxonomy"
|
||||
|
||||
func registerTaxonomyType(luaState *lua.LState) {
|
||||
taxonomyType := luaState.NewTypeMetatable(luaTaxonomyTypeName)
|
||||
luaState.SetGlobal(luaTaxonomyTypeName, taxonomyType)
|
||||
luaState.SetField(taxonomyType, "new", luaState.NewFunction(newTaxonomy))
|
||||
luaState.SetField(taxonomyType, "default", luaState.NewFunction(defaultTaxonomy))
|
||||
luaState.SetField(taxonomyType, "has_default", luaState.NewFunction(hasDefaultTaxonomy))
|
||||
luaState.SetField(taxonomyType, "nil", taxon2Lua(luaState, nil))
|
||||
luaState.SetField(taxonomyType, "__index",
|
||||
luaState.SetFuncs(luaState.NewTable(),
|
||||
taxonomyMethods))
|
||||
}
|
||||
|
||||
func taxonomy2Lua(interpreter *lua.LState,
|
||||
taxonomy *obitax.Taxonomy) lua.LValue {
|
||||
ud := interpreter.NewUserData()
|
||||
ud.Value = taxonomy
|
||||
interpreter.SetMetatable(ud, interpreter.GetTypeMetatable(luaTaxonomyTypeName))
|
||||
|
||||
return ud
|
||||
}
|
||||
|
||||
func newTaxonomy(luaState *lua.LState) int {
|
||||
name := luaState.CheckString(1)
|
||||
code := luaState.CheckString(2)
|
||||
|
||||
charset := obiutils.AsciiAlphaNumSet
|
||||
if luaState.GetTop() > 2 {
|
||||
charset = obiutils.AsciiSetFromString(luaState.CheckString(3))
|
||||
}
|
||||
|
||||
taxonomy := obitax.NewTaxonomy(name, code, charset)
|
||||
|
||||
luaState.Push(taxonomy2Lua(luaState, taxonomy))
|
||||
return 1
|
||||
}
|
||||
|
||||
func defaultTaxonomy(luaState *lua.LState) int {
|
||||
taxonomy := obitax.DefaultTaxonomy()
|
||||
|
||||
if taxonomy == nil {
|
||||
luaState.RaiseError("No default taxonomy")
|
||||
return 0
|
||||
}
|
||||
|
||||
luaState.Push(taxonomy2Lua(luaState, taxonomy))
|
||||
return 1
|
||||
}
|
||||
|
||||
func hasDefaultTaxonomy(luaState *lua.LState) int {
|
||||
taxonomy := obitax.DefaultTaxonomy()
|
||||
|
||||
luaState.Push(lua.LBool(taxonomy != nil))
|
||||
return 1
|
||||
}
|
||||
|
||||
var taxonomyMethods = map[string]lua.LGFunction{
|
||||
"name": taxonomyGetName,
|
||||
"code": taxonomyGetCode,
|
||||
"taxon": taxonomyGetTaxon,
|
||||
}
|
||||
|
||||
func checkTaxonomy(L *lua.LState) *obitax.Taxonomy {
|
||||
ud := L.CheckUserData(1)
|
||||
if v, ok := ud.Value.(*obitax.Taxonomy); ok {
|
||||
return v
|
||||
}
|
||||
L.ArgError(1, "obitax.Taxonomy expected")
|
||||
return nil
|
||||
}
|
||||
|
||||
func taxonomyGetName(luaState *lua.LState) int {
|
||||
taxo := checkTaxonomy(luaState)
|
||||
luaState.Push(lua.LString(taxo.Name()))
|
||||
return 1
|
||||
}
|
||||
|
||||
func taxonomyGetCode(luaState *lua.LState) int {
|
||||
taxo := checkTaxonomy(luaState)
|
||||
luaState.Push(lua.LString(taxo.Code()))
|
||||
return 1
|
||||
}
|
||||
|
||||
func taxonomyGetTaxon(luaState *lua.LState) int {
|
||||
taxo := checkTaxonomy(luaState)
|
||||
taxid := luaState.CheckString(2)
|
||||
taxon, isAlias, err := taxo.Taxon(taxid)
|
||||
|
||||
if err != nil {
|
||||
luaState.RaiseError("%s : Error on taxon taxon: %v", taxid, err)
|
||||
return 0
|
||||
}
|
||||
|
||||
if isAlias && obidefault.FailOnTaxonomy() {
|
||||
luaState.RaiseError("%s : Taxon is an alias of %s", taxid, taxon.String())
|
||||
return 0
|
||||
}
|
||||
|
||||
luaState.Push(taxon2Lua(luaState, taxon))
|
||||
return 1
|
||||
}
|
||||
@@ -66,10 +66,6 @@ func GenerateOptionParser(optionset ...func(*getoptions.GetOpt)) ArgumentParser
|
||||
options.GetEnv("OBISOLEXA"),
|
||||
options.Description("Decodes quality string according to the Solexa specification."))
|
||||
|
||||
options.BoolVar(obidefault.CompressedPtr(), "compressed", obidefault.CompressOutput(),
|
||||
options.Alias("Z"),
|
||||
options.Description("Compress all the result using gzip"))
|
||||
|
||||
for _, o := range optionset {
|
||||
o(options)
|
||||
}
|
||||
@@ -181,6 +177,15 @@ func LoadTaxonomyOptionSet(options *getoptions.GetOpt, required, alternatiive bo
|
||||
options.Alias("a"),
|
||||
options.Description("Enable the search on all alternative names and not only scientific names."))
|
||||
}
|
||||
|
||||
options.BoolVar(obidefault.FailOnTaxonomyPtr(), "fail-on-taxonomy",
|
||||
obidefault.FailOnTaxonomy(),
|
||||
options.Description("Make obitools failing on error if a used taxid is not a currently valid one"),
|
||||
)
|
||||
|
||||
options.BoolVar(obidefault.UpdateTaxidPtr(), "update-taxid", obidefault.UpdateTaxid(),
|
||||
options.Description("Make obitools automatically updating the taxid that are declared merged to a newest one."),
|
||||
)
|
||||
}
|
||||
|
||||
// CLIIsDebugMode returns whether the CLI is in debug mode.
|
||||
|
||||
@@ -8,8 +8,8 @@ import (
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// committed
|
||||
|
||||
var _Commit = "c50a0f4"
|
||||
var _Version = "Release 4.2.0"
|
||||
var _Commit = "7b23314"
|
||||
var _Version = "Release 4.4.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
//
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package obiseq
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
"golang.org/x/exp/slices"
|
||||
@@ -179,3 +180,18 @@ func (s *BioSequenceSlice) SortOnLength(reverse bool) {
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func (s *BioSequenceSlice) ExtractTaxonomy(taxonomy *obitax.Taxonomy) (*obitax.Taxonomy, error) {
|
||||
var err error
|
||||
|
||||
for _, s := range *s {
|
||||
taxonomy, err = taxonomy.InsertPathString(s.Path())
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return taxonomy, nil
|
||||
}
|
||||
|
||||
@@ -196,6 +196,16 @@ func IsShorterOrEqualTo(length int) SequencePredicate {
|
||||
return f
|
||||
}
|
||||
|
||||
func OccurInAtleast(sample string, n int) SequencePredicate {
|
||||
desc := MakeStatsOnDescription(sample)
|
||||
f := func(sequence *BioSequence) bool {
|
||||
stats := sequence.StatsOn(desc, "NA")
|
||||
return len(stats) >= n
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func IsSequenceMatch(pattern string) SequencePredicate {
|
||||
pat, err := regexp.Compile("(?i)" + pattern)
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ func TaxonomyClassifier(taxonomicRank string,
|
||||
if taxon != nil {
|
||||
ttaxon := taxon.TaxonAtRank(taxonomicRank)
|
||||
if abortOnMissing && ttaxon == nil {
|
||||
log.Fatalf("Taxon at rank %s not found in taxonomy for taxid %d", taxonomicRank, taxon.String())
|
||||
log.Fatalf("Taxon at rank %s not found in taxonomy for taxid %s", taxonomicRank, taxon.String())
|
||||
}
|
||||
} else {
|
||||
if abortOnMissing {
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"math"
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
@@ -15,13 +16,20 @@ func (sequence *BioSequence) TaxonomicDistribution(taxonomy *obitax.Taxonomy) ma
|
||||
taxonomy = taxonomy.OrDefault(true)
|
||||
|
||||
for taxid, v := range taxids {
|
||||
t := taxonomy.Taxon(taxid)
|
||||
if t == nil {
|
||||
t, isAlias, err := taxonomy.Taxon(taxid)
|
||||
if err != nil {
|
||||
log.Fatalf(
|
||||
"On sequence %s taxid %s is not defined in taxonomy: %s",
|
||||
"On sequence %s taxid %s is not defined in taxonomy: %s (%v)",
|
||||
sequence.Id(),
|
||||
taxid,
|
||||
taxonomy.Name())
|
||||
taxonomy.Name(),
|
||||
err,
|
||||
)
|
||||
}
|
||||
|
||||
if isAlias && obidefault.FailOnTaxonomy() {
|
||||
log.Fatalf("On sequence %s taxid %s is an alias on %s",
|
||||
sequence.Id(), taxid, t.String())
|
||||
}
|
||||
taxons[t.Node] = v
|
||||
}
|
||||
|
||||
@@ -5,7 +5,9 @@ import (
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
|
||||
@@ -14,7 +16,10 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
|
||||
if taxid == "NA" {
|
||||
return nil
|
||||
}
|
||||
return taxonomy.Taxon(taxid)
|
||||
|
||||
taxon, _, _ := taxonomy.Taxon(taxid)
|
||||
|
||||
return taxon
|
||||
}
|
||||
|
||||
// SetTaxid sets the taxid for the BioSequence.
|
||||
@@ -23,6 +28,9 @@ func (s *BioSequence) Taxon(taxonomy *obitax.Taxonomy) *obitax.Taxon {
|
||||
//
|
||||
// taxid - the taxid to set.
|
||||
func (s *BioSequence) SetTaxid(taxid string, rank ...string) {
|
||||
var err error
|
||||
var isAlias bool
|
||||
|
||||
if taxid == "" {
|
||||
taxid = "NA"
|
||||
} else {
|
||||
@@ -30,11 +38,38 @@ func (s *BioSequence) SetTaxid(taxid string, rank ...string) {
|
||||
taxon := (*obitax.Taxon)(nil)
|
||||
|
||||
if taxonomy != nil {
|
||||
taxon = taxonomy.Taxon(taxid)
|
||||
}
|
||||
taxon, isAlias, err = taxonomy.Taxon(taxid)
|
||||
|
||||
if err != nil {
|
||||
logger := log.Warnf
|
||||
if obidefault.FailOnTaxonomy() {
|
||||
logger = log.Fatalf
|
||||
}
|
||||
logger("%s: Taxid: %v is unknown from taxonomy (%v)",
|
||||
s.Id(), taxid, err)
|
||||
}
|
||||
|
||||
if isAlias {
|
||||
if obidefault.FailOnTaxonomy() {
|
||||
log.Fatalf("%s: Taxid: %v is an alias from taxonomy (%v) to %s",
|
||||
s.Id(), taxid, taxonomy.Name(), taxon.String())
|
||||
} else {
|
||||
if obidefault.UpdateTaxid() {
|
||||
log.Warnf("%s: Taxid: %v is updated to %s",
|
||||
s.Id(), taxid, taxon.String())
|
||||
taxid = taxon.String()
|
||||
} else {
|
||||
log.Warnf("%s: Taxid %v has to be updated to %s",
|
||||
s.Id(), taxid, taxon.String())
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
if taxon != nil {
|
||||
taxid = taxon.String()
|
||||
}
|
||||
}
|
||||
|
||||
if taxon != nil {
|
||||
taxid = taxon.String()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,14 +170,35 @@ func (sequence *BioSequence) SetFamily(taxonomy *obitax.Taxonomy) *obitax.Taxon
|
||||
return sequence.SetTaxonAtRank(taxonomy, "family")
|
||||
}
|
||||
|
||||
func (sequence *BioSequence) SetPath(taxonomy *obitax.Taxonomy) string {
|
||||
func (sequence *BioSequence) SetPath(taxonomy *obitax.Taxonomy) []string {
|
||||
taxon := sequence.Taxon(taxonomy)
|
||||
path := taxon.Path()
|
||||
spath := make([]string, path.Len())
|
||||
lpath := path.Len() - 1
|
||||
|
||||
tpath := path.String()
|
||||
sequence.SetAttribute("taxonomic_path", tpath)
|
||||
for i := lpath; i >= 0; i-- {
|
||||
spath[lpath-i] = path.Get(i).String(taxonomy.Code())
|
||||
}
|
||||
|
||||
return tpath
|
||||
sequence.SetAttribute("taxonomic_path", spath)
|
||||
|
||||
return spath
|
||||
}
|
||||
|
||||
func (sequence *BioSequence) Path() []string {
|
||||
path, ok := sequence.GetAttribute("taxonomic_path")
|
||||
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
|
||||
slice, err := obiutils.InterfaceToStringSlice(path)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("%s: taxonomic_path has the wrong type (%v)", sequence.Id(), err)
|
||||
}
|
||||
|
||||
return slice
|
||||
}
|
||||
|
||||
func (sequence *BioSequence) SetScientificName(taxonomy *obitax.Taxonomy) string {
|
||||
|
||||
@@ -25,7 +25,7 @@ func IsAValidTaxon(taxonomy *obitax.Taxonomy, withAutoCorrection ...bool) Sequen
|
||||
if autocorrection {
|
||||
sequence.SetTaxid(ttaxid)
|
||||
log.Printf(
|
||||
"Sequence %s : Taxid %d updated with %d",
|
||||
"Sequence %s : Taxid %s updated with %s",
|
||||
sequence.Id(),
|
||||
taxid,
|
||||
ttaxid,
|
||||
@@ -63,7 +63,12 @@ func IsSubCladeOfSlot(taxonomy *obitax.Taxonomy, key string) SequencePredicate {
|
||||
val, ok := sequence.GetStringAttribute(key)
|
||||
|
||||
if ok {
|
||||
parent := taxonomy.Taxon(val)
|
||||
parent, _, err := taxonomy.Taxon(val)
|
||||
|
||||
if err != nil {
|
||||
log.Warnf("%s: %s is unkown from the taxonomy (%v)", sequence.Id(), val, err)
|
||||
}
|
||||
|
||||
taxon := sequence.Taxon(taxonomy)
|
||||
return parent != nil && taxon != nil && taxon.IsSubCladeOf(parent)
|
||||
}
|
||||
|
||||
@@ -1 +1,38 @@
|
||||
package obitax
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/TuftsBCB/io/newick"
|
||||
)
|
||||
|
||||
func (taxonomy *Taxonomy) Newick() string {
|
||||
if taxonomy == nil {
|
||||
return ""
|
||||
}
|
||||
|
||||
iterator := taxonomy.AsTaxonSet().Sort().Iterator()
|
||||
|
||||
nodes := make(map[*string]*newick.Tree, taxonomy.Len())
|
||||
trees := make([]*newick.Tree, 0)
|
||||
|
||||
for iterator.Next() {
|
||||
taxon := iterator.Get()
|
||||
tree := &newick.Tree{Label: taxon.String()}
|
||||
nodes[taxon.Node.id] = tree
|
||||
if parent, ok := nodes[taxon.Parent().Node.id]; ok {
|
||||
parent.Children = append(parent.Children, *tree)
|
||||
} else {
|
||||
trees = append(trees, tree)
|
||||
}
|
||||
}
|
||||
|
||||
rep := strings.Builder{}
|
||||
|
||||
for _, tree := range trees {
|
||||
rep.WriteString(tree.String())
|
||||
rep.WriteString("\n")
|
||||
}
|
||||
|
||||
return rep.String()
|
||||
}
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
package obitax
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
var __defaut_taxonomy__ *Taxonomy
|
||||
var __defaut_taxonomy_mutex__ sync.Mutex
|
||||
|
||||
func (taxonomy *Taxonomy) SetAsDefault() {
|
||||
log.Infof("Set as default taxonomy %s", taxonomy.Name())
|
||||
@@ -32,14 +35,18 @@ func DefaultTaxonomy() *Taxonomy {
|
||||
var err error
|
||||
if __defaut_taxonomy__ == nil {
|
||||
if obidefault.HasSelectedTaxonomy() {
|
||||
__defaut_taxonomy__, err = LoadTaxonomy(
|
||||
obidefault.SelectedTaxonomy(),
|
||||
!obidefault.AreAlternativeNamesSelected(),
|
||||
)
|
||||
__defaut_taxonomy_mutex__.Lock()
|
||||
defer __defaut_taxonomy_mutex__.Unlock()
|
||||
if __defaut_taxonomy__ == nil {
|
||||
__defaut_taxonomy__, err = LoadTaxonomy(
|
||||
obidefault.SelectedTaxonomy(),
|
||||
!obidefault.AreAlternativeNamesSelected(),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot load default taxonomy: %v", err)
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot load default taxonomy: %v", err)
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@ package obitax
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// ITaxon represents an iterator for traversing Taxon instances.
|
||||
@@ -195,7 +194,6 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
|
||||
|
||||
pushed := true
|
||||
|
||||
log.Warn(parents)
|
||||
for pushed {
|
||||
itaxo := taxo.Iterator()
|
||||
pushed = false
|
||||
@@ -218,9 +216,9 @@ func (taxon *Taxon) ISubTaxonomy() *ITaxon {
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) ISubTaxonomy(taxid string) *ITaxon {
|
||||
taxon := taxonomy.Taxon(taxid)
|
||||
taxon, _, err := taxonomy.Taxon(taxid)
|
||||
|
||||
if taxon == nil {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -91,7 +91,13 @@ func loadNameTable(reader io.Reader, taxonomy *Taxonomy, onlysn bool) int {
|
||||
|
||||
if !onlysn || classname == "scientific name" {
|
||||
n++
|
||||
taxonomy.Taxon(taxid).SetName(name, classname)
|
||||
taxon, _, err := taxonomy.Taxon(taxid)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("%s: is unknown from the taxonomy", taxid)
|
||||
}
|
||||
|
||||
taxon.SetName(name, classname)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,7 +131,7 @@ func loadMergedTable(reader io.Reader, taxonomy *Taxonomy) int {
|
||||
oldtaxid := strings.TrimSpace(record[0])
|
||||
newtaxid := strings.TrimSpace(record[1])
|
||||
|
||||
taxonomy.AddAlias(newtaxid, oldtaxid, false)
|
||||
taxonomy.AddAlias(oldtaxid, newtaxid, false)
|
||||
}
|
||||
|
||||
return n
|
||||
@@ -196,7 +202,11 @@ func LoadNCBITaxDump(directory string, onlysn bool) (*Taxonomy, error) {
|
||||
n = loadMergedTable(buffered, taxonomy)
|
||||
log.Printf("%d merged taxa read\n", n)
|
||||
|
||||
root := taxonomy.Taxon("1")
|
||||
root, _, err := taxonomy.Taxon("1")
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
|
||||
}
|
||||
taxonomy.SetRoot(root)
|
||||
|
||||
return taxonomy, nil
|
||||
|
||||
@@ -134,7 +134,12 @@ func LoadNCBITarTaxDump(path string, onlysn bool) (*Taxonomy, error) {
|
||||
n = loadMergedTable(buffered, taxonomy)
|
||||
log.Printf("%d merged taxa read\n", n)
|
||||
|
||||
root := taxonomy.Taxon("1")
|
||||
root, _, err := taxonomy.Taxon("1")
|
||||
|
||||
if err != nil {
|
||||
log.Fatal("cannot find the root taxon (1) in the NCBI tax dump")
|
||||
}
|
||||
|
||||
taxonomy.SetRoot(root)
|
||||
|
||||
return taxonomy, nil
|
||||
|
||||
1
pkg/obitax/newick_write.go
Normal file
1
pkg/obitax/newick_write.go
Normal file
@@ -0,0 +1 @@
|
||||
package obitax
|
||||
64
pkg/obitax/string_parser.go
Normal file
64
pkg/obitax/string_parser.go
Normal file
@@ -0,0 +1,64 @@
|
||||
package obitax
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// ParseTaxonString parses a taxon description formatted as
// "code:taxid [scientific name]@rank" and returns its components.
// Whitespace surrounding the whole string and each component is ignored.
//
// Parameters:
//   - taxonStr: the string to parse, e.g. "taxo:9606 [Homo sapiens]@species".
//
// Returns:
//   - code: the taxonomy code (text before the ':').
//   - taxid: the taxon identifier (text between ':' and '[').
//   - scientificName: the text enclosed between the first '[' and the last ']'
//     found before the '@'.
//   - rank: the text following '@'; "no rank" when the '@' part is missing
//     or empty.
//   - err: non-nil when the string holds more than one '@', lacks the square
//     brackets, does not hold exactly one ':' before the '[', or when code,
//     taxid or scientific name is empty. On error every string result is "".
func ParseTaxonString(taxonStr string) (code, taxid, scientificName, rank string, err error) {
	// Separate the optional "@rank" suffix from the main description.
	mainPart, rankPart, _ := strings.Cut(strings.TrimSpace(taxonStr), "@")
	if strings.Contains(rankPart, "@") {
		return "", "", "", "", errors.New("invalid format: multiple '@' characters found")
	}

	mainPart = strings.TrimSpace(mainPart)

	// A missing rank and an empty rank ("...@") are both reported as "no rank".
	rank = strings.TrimSpace(rankPart)
	if rank == "" {
		rank = "no rank"
	}

	// The scientific name is enclosed in square brackets.
	startBracket := strings.Index(mainPart, "[")
	endBracket := strings.LastIndex(mainPart, "]")

	if startBracket == -1 || endBracket == -1 || startBracket > endBracket {
		return "", "", "", "", errors.New("invalid format: scientific name must be enclosed in square brackets")
	}

	name := strings.TrimSpace(mainPart[startBracket+1 : endBracket])

	// Everything before the opening bracket must be exactly "code:taxid".
	rawCode, rawTaxid, found := strings.Cut(strings.TrimSpace(mainPart[:startBracket]), ":")
	if !found || strings.Contains(rawTaxid, ":") {
		return "", "", "", "", errors.New("invalid format: missing taxonomy code separator ':'")
	}

	rawCode = strings.TrimSpace(rawCode)
	rawTaxid = strings.TrimSpace(rawTaxid)

	if rawCode == "" || rawTaxid == "" || name == "" {
		return "", "", "", "", errors.New("invalid format: code, taxid and scientific name cannot be empty")
	}

	return rawCode, rawTaxid, name, rank, nil
}
|
||||
@@ -1,6 +1,7 @@
|
||||
package obitax
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"iter"
|
||||
"regexp"
|
||||
|
||||
@@ -379,3 +380,29 @@ func (taxon *Taxon) SameAs(other *Taxon) bool {
|
||||
|
||||
return taxon.Taxonomy == other.Taxonomy && taxon.Node.id == other.Node.id
|
||||
}
|
||||
|
||||
func (taxon *Taxon) AddChild(child string, replace bool) (*Taxon, error) {
|
||||
if taxon == nil {
|
||||
return nil, errors.New("nil taxon")
|
||||
}
|
||||
|
||||
code, taxid, scientific_name, rank, err := ParseTaxonString(child)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if taxon.Taxonomy.code != code {
|
||||
return nil, errors.New("taxonomy code mismatch")
|
||||
}
|
||||
|
||||
newTaxon, err := taxon.Taxonomy.AddTaxon(taxid, *taxon.Node.id, rank, false, replace)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
newTaxon.SetName(scientific_name, "scientific name")
|
||||
|
||||
return newTaxon, nil
|
||||
}
|
||||
|
||||
@@ -12,7 +12,6 @@ import (
|
||||
"fmt"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// Taxonomy represents a hierarchical classification of taxa.
|
||||
@@ -130,27 +129,30 @@ func (taxonomy *Taxonomy) TaxidString(id string) (string, error) {
|
||||
// Returns:
|
||||
// - A pointer to the Taxon instance associated with the provided taxid.
|
||||
// - If the taxid is unknown, the method will log a fatal error.
|
||||
func (taxonomy *Taxonomy) Taxon(taxid string) *Taxon {
|
||||
func (taxonomy *Taxonomy) Taxon(taxid string) (*Taxon, bool, error) {
|
||||
taxonomy = taxonomy.OrDefault(false)
|
||||
if taxonomy == nil {
|
||||
return nil
|
||||
return nil, false, errors.New("cannot extract taxon from nil taxonomy")
|
||||
}
|
||||
|
||||
id, err := taxonomy.Id(taxid)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Taxid %s: %v", taxid, err)
|
||||
return nil, false, fmt.Errorf("Taxid %s: %v", taxid, err)
|
||||
}
|
||||
|
||||
taxon := taxonomy.nodes.Get(id)
|
||||
isAlias := taxon.Node.id != id
|
||||
|
||||
if taxon == nil {
|
||||
log.Fatalf("Taxid %s is not part of the taxonomy %s",
|
||||
taxid,
|
||||
taxonomy.name)
|
||||
return nil,
|
||||
false,
|
||||
fmt.Errorf("Taxid %s is not part of the taxonomy %s",
|
||||
taxid,
|
||||
taxonomy.name)
|
||||
}
|
||||
|
||||
return taxon
|
||||
return taxon, isAlias, nil
|
||||
}
|
||||
|
||||
// AsTaxonSet returns the set of taxon nodes contained within the Taxonomy.
|
||||
@@ -353,3 +355,63 @@ func (taxonomy *Taxonomy) HasRoot() bool {
|
||||
taxonomy = taxonomy.OrDefault(false)
|
||||
return taxonomy != nil && taxonomy.root != nil
|
||||
}
|
||||
|
||||
func (taxonomy *Taxonomy) InsertPathString(path []string) (*Taxonomy, error) {
|
||||
if len(path) == 0 {
|
||||
return nil, errors.New("path is empty")
|
||||
}
|
||||
|
||||
code, taxid, scientific_name, rank, err := ParseTaxonString(path[0])
|
||||
|
||||
if taxonomy == nil {
|
||||
taxonomy = NewTaxonomy(code, code, obiutils.AsciiAlphaNumSet)
|
||||
}
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if taxonomy.Len() == 0 {
|
||||
|
||||
if code != taxonomy.code {
|
||||
return nil, fmt.Errorf("cannot insert taxon %s into taxonomy %s with code %s",
|
||||
path[0], taxonomy.name, taxonomy.code)
|
||||
}
|
||||
|
||||
root, err := taxonomy.AddTaxon(taxid, taxid, rank, true, true)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
root.SetName(scientific_name, "scientificName")
|
||||
}
|
||||
|
||||
var current *Taxon
|
||||
current, _, err = taxonomy.Taxon(taxid)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if !current.IsRoot() {
|
||||
return nil, errors.New("path does not start with a root node")
|
||||
}
|
||||
|
||||
for _, id := range path[1:] {
|
||||
taxon, _, err := taxonomy.Taxon(id)
|
||||
if err == nil {
|
||||
if !current.SameAs(taxon.Parent()) {
|
||||
return nil, errors.New("path is not consistent with the taxonomy, parent mismatch")
|
||||
}
|
||||
current = taxon
|
||||
} else {
|
||||
current, err = current.AddChild(id, false)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return taxonomy, nil
|
||||
}
|
||||
|
||||
@@ -151,7 +151,8 @@ func (set *TaxonSet) Alias(id *string, taxon *Taxon) {
|
||||
if original == nil {
|
||||
log.Fatalf("Original taxon %v is not part of taxon set", id)
|
||||
}
|
||||
set.set[id] = taxon.Node
|
||||
|
||||
set.set[id] = original.Node
|
||||
set.nalias++
|
||||
}
|
||||
|
||||
@@ -196,3 +197,30 @@ func (set *TaxonSet) Contains(id *string) bool {
|
||||
node := set.Get(id)
|
||||
return node != nil
|
||||
}
|
||||
|
||||
func (set *TaxonSet) Sort() *TaxonSlice {
|
||||
if set == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
taxonomy := set.Taxonomy()
|
||||
taxa := taxonomy.NewTaxonSlice(0, set.Len())
|
||||
parent := make(map[*TaxNode]bool, set.Len())
|
||||
|
||||
pushed := true
|
||||
|
||||
for pushed {
|
||||
pushed = false
|
||||
for _, node := range set.set {
|
||||
if !parent[node] && (parent[set.Get(node.parent).Node] ||
|
||||
!set.Contains(node.parent) ||
|
||||
node == taxonomy.Root().Node) {
|
||||
pushed = true
|
||||
taxa.slice = append(taxa.slice, node)
|
||||
parent[node] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return taxa
|
||||
}
|
||||
|
||||
126
pkg/obitools/obiclean/chimera.go
Normal file
126
pkg/obitools/obiclean/chimera.go
Normal file
@@ -0,0 +1,126 @@
|
||||
package obiclean
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func commonPrefix(a, b *obiseq.BioSequence) int {
|
||||
i := 0
|
||||
l := min(a.Len(), b.Len())
|
||||
|
||||
if l == 0 {
|
||||
return 0
|
||||
}
|
||||
as := a.Sequence()
|
||||
bs := b.Sequence()
|
||||
|
||||
for i < l && as[i] == bs[i] {
|
||||
i++
|
||||
}
|
||||
|
||||
if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
|
||||
log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
|
||||
}
|
||||
|
||||
return i
|
||||
}
|
||||
|
||||
func commonSuffix(a, b *obiseq.BioSequence) int {
|
||||
i := a.Len() - 1
|
||||
j := b.Len() - 1
|
||||
|
||||
if i < 0 || j < 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
as := a.Sequence()
|
||||
bs := b.Sequence()
|
||||
|
||||
l := 0
|
||||
for i >= 0 && j >= 0 && as[i] == bs[j] {
|
||||
i--
|
||||
j--
|
||||
l++
|
||||
}
|
||||
|
||||
if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
|
||||
log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
|
||||
}
|
||||
// log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
|
||||
|
||||
return l
|
||||
}
|
||||
|
||||
func AnnotateChimera(samples map[string]*[]*seqPCR) {
|
||||
|
||||
w := func(sample string, seqs *[]*seqPCR) {
|
||||
ls := len(*seqs)
|
||||
cp := make([]int, ls)
|
||||
cs := make([]int, ls)
|
||||
|
||||
pcrs := make([]*seqPCR, 0, ls)
|
||||
|
||||
for _, s := range *seqs {
|
||||
if len(s.Edges) == 0 {
|
||||
pcrs = append(pcrs, s)
|
||||
}
|
||||
}
|
||||
|
||||
lp := len(pcrs)
|
||||
|
||||
sort.Slice(pcrs, func(i, j int) bool {
|
||||
return pcrs[i].Weight < pcrs[j].Weight
|
||||
})
|
||||
|
||||
for i, s := range pcrs {
|
||||
for j := i + 1; j < lp; j++ {
|
||||
s2 := pcrs[j]
|
||||
cp[j] = commonPrefix(s.Sequence, s2.Sequence)
|
||||
cs[j] = commonSuffix(s.Sequence, s2.Sequence)
|
||||
}
|
||||
|
||||
var cm map[string]string
|
||||
var err error
|
||||
|
||||
chimera, ok := s.Sequence.GetAttribute("chimera")
|
||||
|
||||
if !ok {
|
||||
cm = map[string]string{}
|
||||
} else {
|
||||
cm, err = obiutils.InterfaceToStringMap(chimera)
|
||||
if err != nil {
|
||||
log.Fatalf("type of chimera not map[string]string: %T (%v)",
|
||||
chimera, err)
|
||||
}
|
||||
}
|
||||
|
||||
ls := s.Sequence.Len()
|
||||
|
||||
for k := i + 1; k < lp; k++ {
|
||||
for l := i + 1; l < lp; l++ {
|
||||
if k != l && cp[k]+cs[l] == ls {
|
||||
cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)",
|
||||
pcrs[k].Sequence.Id(),
|
||||
pcrs[l].Sequence.Id(),
|
||||
cp[k])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(cm) > 0 {
|
||||
s.Sequence.SetAttribute("chimera", cm)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for sn, sqs := range samples {
|
||||
w(sn, sqs)
|
||||
}
|
||||
|
||||
}
|
||||
@@ -13,23 +13,24 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
|
||||
type Ratio struct {
|
||||
Sample string
|
||||
SeqID string
|
||||
status string
|
||||
From int
|
||||
To int
|
||||
CFrom int
|
||||
CTo int
|
||||
Pos int
|
||||
Length int
|
||||
A int
|
||||
C int
|
||||
G int
|
||||
T int
|
||||
Sample string
|
||||
SeqID string
|
||||
OriginalStatus string
|
||||
WOriginal int
|
||||
WMutant int
|
||||
COriginal int
|
||||
CMutant int
|
||||
Pos int
|
||||
Length int
|
||||
A int
|
||||
C int
|
||||
G int
|
||||
T int
|
||||
}
|
||||
|
||||
type Edge struct {
|
||||
@@ -52,45 +53,21 @@ func makeEdge(father, dist, pos int, from, to byte) Edge {
|
||||
}
|
||||
}
|
||||
|
||||
func abs(x int) int {
|
||||
if x < 0 {
|
||||
return -x
|
||||
}
|
||||
return x
|
||||
}
|
||||
|
||||
func max(x, y int) int {
|
||||
if x > y {
|
||||
return x
|
||||
}
|
||||
return y
|
||||
}
|
||||
|
||||
func min(x, y int) int {
|
||||
if x < y {
|
||||
return x
|
||||
}
|
||||
return y
|
||||
}
|
||||
|
||||
func minMax(x, y int) (int, int) {
|
||||
if x < y {
|
||||
return x, y
|
||||
}
|
||||
return y, x
|
||||
|
||||
}
|
||||
|
||||
// It takes a filename and a 2D slice of Ratio values produced during graph building,
|
||||
// and writes a CSV file with the first column being the
|
||||
// first nucleotide, the second column being the second nucleotide, and the third column being the
|
||||
// ratio
|
||||
func EmpiricalDistCsv(filename string, data [][]Ratio) {
|
||||
func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
||||
file, err := os.Create(filename)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
destfile, err := obiutils.CompressStream(file, true, true)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
}
|
||||
defer destfile.Close()
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
@@ -103,19 +80,19 @@ func EmpiricalDistCsv(filename string, data [][]Ratio) {
|
||||
|
||||
bar := progressbar.NewOptions(len(data), pbopt...)
|
||||
|
||||
fmt.Fprintln(file, "Sample,Father_id,Father_status,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length,A,C,G,T")
|
||||
fmt.Fprintln(destfile, "Sample,Origin_id,Origin_status,Origin,Mutant,Origin_Weight,Mutant_Weight,Origin_Count,Mutant_Count,Position,Origin_length,A,C,G,T")
|
||||
for code, dist := range data {
|
||||
a1, a2 := intToNucPair(code)
|
||||
for _, ratio := range dist {
|
||||
fmt.Fprintf(file, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
|
||||
fmt.Fprintf(destfile, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
|
||||
ratio.Sample,
|
||||
ratio.SeqID,
|
||||
ratio.status,
|
||||
ratio.OriginalStatus,
|
||||
a1, a2,
|
||||
ratio.From,
|
||||
ratio.To,
|
||||
ratio.CFrom,
|
||||
ratio.CTo,
|
||||
ratio.WOriginal,
|
||||
ratio.WMutant,
|
||||
ratio.COriginal,
|
||||
ratio.CMutant,
|
||||
ratio.Pos,
|
||||
ratio.Length,
|
||||
ratio.A,
|
||||
@@ -478,16 +455,20 @@ func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
|
||||
if father.Weight >= minStatRatio && edge.Dist == 1 {
|
||||
s := father.Sequence.Sequence()
|
||||
ratio[edge.NucPair] = append(ratio[edge.NucPair],
|
||||
Ratio{name,
|
||||
father.Sequence.Id(), Status(father.Sequence)[name],
|
||||
father.Weight, seq.Weight,
|
||||
father.Count, seq.Count,
|
||||
edge.Pos,
|
||||
father.Sequence.Len(),
|
||||
bytes.Count(s, []byte("a")),
|
||||
bytes.Count(s, []byte("c")),
|
||||
bytes.Count(s, []byte("g")),
|
||||
bytes.Count(s, []byte("t"))})
|
||||
Ratio{
|
||||
Sample: name,
|
||||
SeqID: father.Sequence.Id(),
|
||||
OriginalStatus: Status(father.Sequence)[name],
|
||||
WOriginal: father.Weight,
|
||||
WMutant: seq.Weight,
|
||||
COriginal: father.Count,
|
||||
CMutant: seq.Count,
|
||||
Pos: edge.Pos,
|
||||
Length: father.Sequence.Len(),
|
||||
A: bytes.Count(s, []byte("a")),
|
||||
C: bytes.Count(s, []byte("c")),
|
||||
G: bytes.Count(s, []byte("g")),
|
||||
T: bytes.Count(s, []byte("t"))})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ package obiclean
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"maps"
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
@@ -19,6 +20,7 @@ type seqPCR struct {
|
||||
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
|
||||
SonCount int
|
||||
AddedSons int
|
||||
IsHead bool
|
||||
Edges []Edge
|
||||
Cluster map[int]bool // used as the set of head sequences associated to that sequence
|
||||
}
|
||||
@@ -50,6 +52,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
|
||||
Sequence: s,
|
||||
SonCount: 0,
|
||||
AddedSons: 0,
|
||||
IsHead: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -57,9 +60,7 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
|
||||
return samples
|
||||
}
|
||||
|
||||
func annotateOBIClean(source string, dataset obiseq.BioSequenceSlice,
|
||||
sample map[string]*([]*seqPCR),
|
||||
tag, NAValue string) obiiter.IBioSequence {
|
||||
func annotateOBIClean(source string, dataset obiseq.BioSequenceSlice) obiiter.IBioSequence {
|
||||
batchsize := 1000
|
||||
var annot = func(data obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
|
||||
|
||||
@@ -114,6 +115,28 @@ func IsHead(sequence *obiseq.BioSequence) bool {
|
||||
return ishead
|
||||
}
|
||||
|
||||
func NotAlwaysChimera(tag string) obiseq.SequencePredicate {
|
||||
descriptor := obiseq.MakeStatsOnDescription(tag)
|
||||
predicat := func(sequence *obiseq.BioSequence) bool {
|
||||
|
||||
chimera, ok := sequence.GetStringMap("chimera")
|
||||
if !ok || len(chimera) == 0 {
|
||||
return true
|
||||
}
|
||||
samples := maps.Keys(sequence.StatsOn(descriptor, "NA"))
|
||||
|
||||
for s := range samples {
|
||||
if _, ok := chimera[s]; !ok {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
return predicat
|
||||
}
|
||||
|
||||
func HeadCount(sequence *obiseq.BioSequence) int {
|
||||
var err error
|
||||
annotation := sequence.Annotations()
|
||||
@@ -237,6 +260,7 @@ func Mutation(sample map[string]*([]*seqPCR)) {
|
||||
}
|
||||
|
||||
func Status(sequence *obiseq.BioSequence) map[string]string {
|
||||
var err error
|
||||
annotation := sequence.Annotations()
|
||||
iobistatus, ok := annotation["obiclean_status"]
|
||||
var obistatus map[string]string
|
||||
@@ -246,9 +270,9 @@ func Status(sequence *obiseq.BioSequence) map[string]string {
|
||||
case map[string]string:
|
||||
obistatus = iobistatus
|
||||
case map[string]interface{}:
|
||||
obistatus = make(map[string]string)
|
||||
for k, v := range iobistatus {
|
||||
obistatus[k] = fmt.Sprint(v)
|
||||
obistatus, err = obiutils.InterfaceToStringMap(obistatus)
|
||||
if err != nil {
|
||||
log.Panicf("obiclean_status attribute of sequence %s must be castable to a map[string]string", sequence.Id())
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -356,19 +380,30 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
}
|
||||
}
|
||||
|
||||
if DetectChimera() {
|
||||
AnnotateChimera(samples)
|
||||
}
|
||||
|
||||
if SaveGraphToFiles() {
|
||||
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
|
||||
}
|
||||
|
||||
if IsSaveRatioTable() {
|
||||
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
|
||||
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
|
||||
EmpiricalDistCsv(RatioTableFilename(), all_ratio, obidefault.CompressOutput())
|
||||
}
|
||||
|
||||
iter := annotateOBIClean(source, db, samples, SampleAttribute(), "NA")
|
||||
iter := annotateOBIClean(source, db)
|
||||
|
||||
if OnlyHead() {
|
||||
iter = iter.FilterOn(IsHead, 1000)
|
||||
iter = iter.FilterOn(IsHead,
|
||||
obidefault.BatchSize()).FilterOn(NotAlwaysChimera(SampleAttribute()),
|
||||
obidefault.BatchSize())
|
||||
}
|
||||
|
||||
if MinSampleCount() > 1 {
|
||||
sc := obiseq.OccurInAtleast(SampleAttribute(), MinSampleCount())
|
||||
iter = iter.FilterOn(sc, obidefault.BatchSize())
|
||||
}
|
||||
|
||||
return iter
|
||||
|
||||
@@ -16,6 +16,8 @@ var _onlyHead = false
|
||||
|
||||
var _saveGraph = "__@@NOSAVE@@__"
|
||||
var _saveRatio = "__@@NOSAVE@@__"
|
||||
var _minSample = 1
|
||||
var _detectChimera = false
|
||||
|
||||
func ObicleanOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
|
||||
@@ -55,6 +57,13 @@ func ObicleanOptionSet(options *getoptions.GetOpt) {
|
||||
"The ratio file follows the csv format."),
|
||||
)
|
||||
|
||||
options.IntVar(&_minSample, "min-sample-count", _minSample,
|
||||
options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
|
||||
)
|
||||
|
||||
options.BoolVar(&_detectChimera, "detect-chimera", _detectChimera,
|
||||
options.Description("Detect chimera sequences."),
|
||||
)
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
@@ -111,3 +120,13 @@ func IsSaveRatioTable() bool {
|
||||
func RatioTableFilename() string {
|
||||
return _saveRatio
|
||||
}
|
||||
|
||||
// It returns the minimum number of samples a sequence must be present in to be considered in the analysis
|
||||
func MinSampleCount() int {
|
||||
return _minSample
|
||||
}
|
||||
|
||||
// It returns true if chimera detection is enabled
|
||||
func DetectChimera() bool {
|
||||
return _detectChimera
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package obiconvert
|
||||
import (
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
@@ -30,7 +31,6 @@ var __output_fastjson_format__ = false
|
||||
var __output_fastobi_format__ = false
|
||||
|
||||
var __no_progress_bar__ = false
|
||||
var __compressed__ = false
|
||||
var __skip_empty__ = false
|
||||
|
||||
var __output_file_name__ = "-"
|
||||
@@ -71,16 +71,16 @@ func InputOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
}
|
||||
|
||||
func OutputModeOptionSet(options *getoptions.GetOpt) {
|
||||
func OutputModeOptionSet(options *getoptions.GetOpt, compressed bool) {
|
||||
options.BoolVar(&__no_progress_bar__, "no-progressbar", false,
|
||||
options.Description("Disable the progress bar printing"))
|
||||
|
||||
options.BoolVar(&__compressed__, "compress", false,
|
||||
options.Alias("Z"),
|
||||
options.Description("Output is compressed"))
|
||||
if compressed {
|
||||
options.BoolVar(obidefault.CompressedPtr(), "compressed", obidefault.CompressOutput(),
|
||||
options.Alias("Z"),
|
||||
options.Description("Compress all the result using gzip"))
|
||||
|
||||
options.BoolVar(&__skip_empty__, "skip-empty", __skip_empty__,
|
||||
options.Description("Sequences of length equal to zero are suppressed from the output"))
|
||||
}
|
||||
|
||||
options.StringVar(&__output_file_name__, "out", __output_file_name__,
|
||||
options.Alias("o"),
|
||||
@@ -90,6 +90,9 @@ func OutputModeOptionSet(options *getoptions.GetOpt) {
|
||||
}
|
||||
|
||||
func OutputOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&__skip_empty__, "skip-empty", __skip_empty__,
|
||||
options.Description("Sequences of length equal to zero are suppressed from the output"))
|
||||
|
||||
options.BoolVar(&__output_in_fasta__, "fasta-output", false,
|
||||
options.Description("Write sequence in fasta format (default if no quality data available)."))
|
||||
|
||||
@@ -105,7 +108,7 @@ func OutputOptionSet(options *getoptions.GetOpt) {
|
||||
options.Alias("O"),
|
||||
options.Description("output FASTA/FASTQ title line annotations follow OBI format."))
|
||||
|
||||
OutputModeOptionSet(options)
|
||||
OutputModeOptionSet(options, true)
|
||||
}
|
||||
|
||||
func PairedFilesOptionSet(options *getoptions.GetOpt) {
|
||||
@@ -159,10 +162,6 @@ func CLIOutputFormat() string {
|
||||
}
|
||||
}
|
||||
|
||||
func CLICompressed() bool {
|
||||
return __compressed__
|
||||
}
|
||||
|
||||
func CLISkipEmpty() bool {
|
||||
return __skip_empty__
|
||||
}
|
||||
|
||||
@@ -55,6 +55,8 @@ func ExpandListOfFiles(check_ext bool, filenames ...string) ([]string, error) {
|
||||
strings.HasSuffix(path, "fasta.gz") ||
|
||||
strings.HasSuffix(path, "fastq") ||
|
||||
strings.HasSuffix(path, "fastq.gz") ||
|
||||
strings.HasSuffix(path, "fq") ||
|
||||
strings.HasSuffix(path, "fq.gz") ||
|
||||
strings.HasSuffix(path, "seq") ||
|
||||
strings.HasSuffix(path, "seq.gz") ||
|
||||
strings.HasSuffix(path, "gb") ||
|
||||
@@ -140,7 +142,7 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
||||
}
|
||||
|
||||
switch CLIInputFormat() {
|
||||
case "fastq":
|
||||
case "fastq", "fq":
|
||||
reader = obiformats.ReadFastqFromFile
|
||||
case "fasta":
|
||||
reader = obiformats.ReadFastaFromFile
|
||||
@@ -168,22 +170,25 @@ func CLIReadBioSequences(filenames ...string) (obiiter.IBioSequence, error) {
|
||||
opts...,
|
||||
)
|
||||
} else {
|
||||
iterator, err = reader(list_of_files[0], opts...)
|
||||
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
if CLIPairedFileName() != "" {
|
||||
ip, err := reader(CLIPairedFileName(), opts...)
|
||||
if len(list_of_files) > 0 {
|
||||
iterator, err = reader(list_of_files[0], opts...)
|
||||
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
iterator = iterator.PairTo(ip)
|
||||
}
|
||||
if CLIPairedFileName() != "" {
|
||||
ip, err := reader(CLIPairedFileName(), opts...)
|
||||
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
iterator = iterator.PairTo(ip)
|
||||
}
|
||||
} else {
|
||||
iterator = obiiter.NilIBioSequence
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -21,7 +21,7 @@ func BuildPairedFileNames(filename string) (string, string) {
|
||||
forward := parts[0] + "_R1"
|
||||
reverse := parts[0] + "_R2"
|
||||
|
||||
if parts[1] != "" {
|
||||
if len(parts) > 1 && parts[1] != "" {
|
||||
suffix := "." + parts[1]
|
||||
forward += suffix
|
||||
reverse += suffix
|
||||
@@ -58,7 +58,7 @@ func CLIWriteBioSequences(iterator obiiter.IBioSequence,
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers))
|
||||
opts = append(opts, obiformats.OptionsBatchSize(obidefault.BatchSize()))
|
||||
|
||||
opts = append(opts, obiformats.OptionsCompressed(CLICompressed()))
|
||||
opts = append(opts, obiformats.OptionsCompressed(obidefault.CompressOutput()))
|
||||
|
||||
var err error
|
||||
|
||||
|
||||
@@ -33,6 +33,7 @@ func CLIWriteSequenceCSV(iterator obiiter.IBioSequence,
|
||||
CSVDefinition(CLIPrintDefinition()),
|
||||
CSVKeys(CLIToBeKeptAttributes()),
|
||||
CSVSequence(CLIPrintSequence()),
|
||||
CSVQuality(CLIPrintQuality()),
|
||||
CSVAutoColumn(CLIAutoColumns()),
|
||||
)
|
||||
|
||||
|
||||
@@ -66,7 +66,7 @@ func CSVOptionSet(options *getoptions.GetOpt) {
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.InputOptionSet(options)
|
||||
obiconvert.OutputModeOptionSet(options)
|
||||
obiconvert.OutputModeOptionSet(options, true)
|
||||
obioptions.LoadTaxonomyOptionSet(options, false, false)
|
||||
CSVOptionSet(options)
|
||||
}
|
||||
|
||||
@@ -40,7 +40,7 @@ func CSVSequenceHeader(opt Options) obiitercsv.CSVHeader {
|
||||
}
|
||||
|
||||
if opt.CSVQuality() {
|
||||
record.AppendField("quality")
|
||||
record.AppendField("qualities")
|
||||
}
|
||||
|
||||
return record
|
||||
@@ -100,9 +100,9 @@ func CSVBatchFromSequences(batch obiiter.BioSequenceBatch, opt Options) obiiterc
|
||||
for j := 0; j < l; j++ {
|
||||
ascii[j] = uint8(q[j]) + uint8(quality_shift)
|
||||
}
|
||||
record["quality"] = string(ascii)
|
||||
record["qualities"] = string(ascii)
|
||||
} else {
|
||||
record["quality"] = opt.CSVNAValue()
|
||||
record["qualities"] = opt.CSVNAValue()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ func CLIDistributeSequence(sequences obiiter.IBioSequence) {
|
||||
opts = append(opts, obiformats.OptionsParallelWorkers(nworkers),
|
||||
obiformats.OptionsBatchSize(obidefault.BatchSize()),
|
||||
obiformats.OptionsAppendFile(CLIAppendSequences()),
|
||||
obiformats.OptionsCompressed(obiconvert.CLICompressed()))
|
||||
obiformats.OptionsCompressed(obidefault.CompressOutput()))
|
||||
|
||||
var formater obiformats.SequenceBatchWriterToFile
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
@@ -16,6 +17,7 @@ import (
|
||||
var _BelongTaxa = make([]string, 0)
|
||||
var _NotBelongTaxa = make([]string, 0)
|
||||
var _RequiredRanks = make([]string, 0)
|
||||
var _ValidateTaxonomy = false
|
||||
|
||||
var _MinimumLength = 1
|
||||
var _MaximumLength = int(2e9)
|
||||
@@ -62,6 +64,9 @@ func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
|
||||
options.ArgName("RANK_NAME"),
|
||||
options.Description("Select sequences belonging a taxon with a rank <RANK_NAME>"))
|
||||
|
||||
options.BoolVar(&_ValidateTaxonomy, "valid-taxid", _ValidateTaxonomy,
|
||||
options.Description("Validate the taxonomic classification of the sequences."))
|
||||
|
||||
}
|
||||
|
||||
func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
|
||||
@@ -248,15 +253,15 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
if len(_BelongTaxa) > 0 {
|
||||
taxonomy := obitax.DefaultTaxonomy()
|
||||
|
||||
taxon := taxonomy.Taxon(_BelongTaxa[0])
|
||||
if taxon == nil {
|
||||
taxon, _, err := taxonomy.Taxon(_BelongTaxa[0])
|
||||
if err != nil {
|
||||
p = obiseq.IsSubCladeOfSlot(taxonomy, _BelongTaxa[0])
|
||||
} else {
|
||||
p = obiseq.IsSubCladeOf(taxonomy, taxon)
|
||||
}
|
||||
for _, staxid := range _BelongTaxa[1:] {
|
||||
taxon := taxonomy.Taxon(staxid)
|
||||
if taxon == nil {
|
||||
taxon, _, err := taxonomy.Taxon(staxid)
|
||||
if err != nil {
|
||||
p2 = obiseq.IsSubCladeOfSlot(taxonomy, staxid)
|
||||
} else {
|
||||
p2 = obiseq.IsSubCladeOf(taxonomy, taxon)
|
||||
@@ -271,6 +276,27 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIIsValidTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
if _ValidateTaxonomy {
|
||||
if !obidefault.HasSelectedTaxonomy() {
|
||||
log.Fatal("Taxonomy not found")
|
||||
}
|
||||
taxonomy := obitax.DefaultTaxonomy()
|
||||
if taxonomy == nil {
|
||||
log.Fatal("Taxonomy not found")
|
||||
}
|
||||
|
||||
predicat := func(sequences *obiseq.BioSequence) bool {
|
||||
taxon := sequences.Taxon(taxonomy)
|
||||
return taxon != nil
|
||||
}
|
||||
|
||||
return predicat
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
var p obiseq.SequencePredicate
|
||||
var p2 obiseq.SequencePredicate
|
||||
@@ -278,16 +304,16 @@ func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
if len(_NotBelongTaxa) > 0 {
|
||||
taxonomy := obitax.DefaultTaxonomy()
|
||||
|
||||
taxon := taxonomy.Taxon(_NotBelongTaxa[0])
|
||||
if taxon == nil {
|
||||
taxon, _, err := taxonomy.Taxon(_NotBelongTaxa[0])
|
||||
if err != nil {
|
||||
p = obiseq.IsSubCladeOfSlot(taxonomy, _NotBelongTaxa[0])
|
||||
} else {
|
||||
p = obiseq.IsSubCladeOf(taxonomy, taxon)
|
||||
}
|
||||
|
||||
for _, taxid := range _NotBelongTaxa[1:] {
|
||||
taxon := taxonomy.Taxon(taxid)
|
||||
if taxon == nil {
|
||||
taxon, _, err := taxonomy.Taxon(taxid)
|
||||
if err != nil {
|
||||
p2 = obiseq.IsSubCladeOfSlot(taxonomy, taxid)
|
||||
} else {
|
||||
p2 = obiseq.IsSubCladeOf(taxonomy, taxon)
|
||||
@@ -319,7 +345,7 @@ func CLIHasRankDefinedPredicate() obiseq.SequencePredicate {
|
||||
}
|
||||
|
||||
func CLITaxonomyFilterPredicate() obiseq.SequencePredicate {
|
||||
return CLIHasRankDefinedPredicate().And(CLIRestrictTaxonomyPredicate()).And(CLIAvoidTaxonomyPredicate())
|
||||
return CLIIsValidTaxonomyPredicate().And(CLIAvoidTaxonomyPredicate()).And(CLIHasRankDefinedPredicate()).And(CLIRestrictTaxonomyPredicate())
|
||||
}
|
||||
|
||||
func CLIPredicatesPredicate() obiseq.SequencePredicate {
|
||||
|
||||
@@ -129,6 +129,7 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
|
||||
}
|
||||
lcons := cons.Len()
|
||||
aliLength := lcons - _Abs(left) - _Abs(right)
|
||||
|
||||
identity := float64(match) / float64(aliLength)
|
||||
if aliLength == 0 {
|
||||
identity = 0
|
||||
@@ -237,7 +238,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
|
||||
log.Printf("End of the sequence Pairing")
|
||||
}()
|
||||
|
||||
f := func(iterator obiiter.IBioSequence, wid int) {
|
||||
f := func(iterator obiiter.IBioSequence) {
|
||||
arena := obialign.MakePEAlignArena(150, 150)
|
||||
shifts := make(map[int]int)
|
||||
|
||||
@@ -262,9 +263,9 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
|
||||
log.Printf("Start of the sequence Pairing using %d workers\n", nworkers)
|
||||
|
||||
for i := 0; i < nworkers-1; i++ {
|
||||
go f(iterator.Split(), i)
|
||||
go f(iterator.Split())
|
||||
}
|
||||
go f(iterator, nworkers-1)
|
||||
go f(iterator)
|
||||
return newIter
|
||||
|
||||
}
|
||||
|
||||
@@ -42,9 +42,10 @@ func MatchDistanceIndex(taxonomy *obitax.Taxonomy, distance int, distanceIdx map
|
||||
if i == len(keys) || distance > keys[len(keys)-1] {
|
||||
taxon = taxonomy.Root()
|
||||
} else {
|
||||
taxon = taxonomy.Taxon(distanceIdx[keys[i]])
|
||||
if taxon == nil {
|
||||
log.Panicf("Cannot identify taxon %s in %s", distanceIdx[keys[i]], taxonomy.Name())
|
||||
var err error
|
||||
taxon, _, err = taxonomy.Taxon(distanceIdx[keys[i]])
|
||||
if err != nil {
|
||||
log.Panicf("Cannot identify taxon %s in %s (%v)", distanceIdx[keys[i]], taxonomy.Name(), err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -72,6 +73,10 @@ func FindClosests(sequence *obiseq.BioSequence,
|
||||
refcounts []*obikmer.Table4mer,
|
||||
runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) {
|
||||
|
||||
if sequence.Len() < 5 {
|
||||
return obiseq.BioSequenceSlice{}, 1000, 0, "NA", []int{}
|
||||
}
|
||||
|
||||
var matrix []uint64
|
||||
|
||||
seqwords := obikmer.Count4Mer(sequence, nil, nil)
|
||||
@@ -196,9 +201,9 @@ func Identify(sequence *obiseq.BioSequence,
|
||||
log.Panic("Problem in identification line : ", best.Id(), "idx:", idx, "distance:", d)
|
||||
}
|
||||
|
||||
match_taxon := taxo.Taxon(identification)
|
||||
match_taxon, _, err := taxo.Taxon(identification)
|
||||
|
||||
if taxon != nil {
|
||||
if err == nil {
|
||||
taxon, _ = taxon.LCA(match_taxon)
|
||||
} else {
|
||||
taxon = match_taxon
|
||||
@@ -255,7 +260,7 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
|
||||
if taxon != nil {
|
||||
j++
|
||||
} else {
|
||||
log.Warnf("Taxid %d is not described in the taxonomy %s."+
|
||||
log.Warnf("Taxid %s is not described in the taxonomy %s."+
|
||||
" Sequence %s is discared from the reference database",
|
||||
seq.Taxid(), taxo.Name(), seq.Id())
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
@@ -43,7 +42,6 @@ func TagOptionSet(options *getoptions.GetOpt) {
|
||||
// the obiuniq command
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
obioptions.LoadTaxonomyOptionSet(options, true, false)
|
||||
TagOptionSet(options)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,10 +1,15 @@
|
||||
package obitaxonomy
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"time"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiitercsv"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obicsv"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
@@ -73,3 +78,18 @@ func CLICSVTaxaIterator(iterator *obitax.ITaxon) *obiitercsv.ICSVRecord {
|
||||
func CLICSVTaxaWriter(iterator *obitax.ITaxon, terminalAction bool) *obiitercsv.ICSVRecord {
|
||||
return obicsv.CLICSVWriter(CLICSVTaxaIterator(iterator), terminalAction)
|
||||
}
|
||||
|
||||
func CLIDownloadNCBITaxdump() error {
|
||||
now := time.Now()
|
||||
dateStr := now.Format("20060102") // In Go, this specific date is used as reference for formatting
|
||||
|
||||
filename := fmt.Sprintf("ncbitaxo_%s.tgz", dateStr)
|
||||
|
||||
if obiconvert.CLIOutPutFileName() != "-" {
|
||||
filename = obiconvert.CLIOutPutFileName()
|
||||
}
|
||||
|
||||
log.Infof("Downloading NCBI Taxdump to %s", filename)
|
||||
return obiutils.DownloadFile("https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz", filename)
|
||||
|
||||
}
|
||||
@@ -5,6 +5,7 @@ import (
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
)
|
||||
|
||||
@@ -22,6 +23,8 @@ var __taxid_path__ = "NA"
|
||||
var __taxid_sons__ = "NA"
|
||||
var __restrict_rank__ = ""
|
||||
var __to_dump__ = ""
|
||||
var __download_ncbi__ = false
|
||||
var __extract_taxonomy__ = false
|
||||
|
||||
func FilterTaxonomyOptionSet(options *getoptions.GetOpt) {
|
||||
options.BoolVar(&__rank_list__, "rank-list", false,
|
||||
@@ -34,7 +37,8 @@ func FilterTaxonomyOptionSet(options *getoptions.GetOpt) {
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obioptions.LoadTaxonomyOptionSet(options, true, true)
|
||||
obioptions.LoadTaxonomyOptionSet(options, false, true)
|
||||
obiconvert.OutputModeOptionSet(options, false)
|
||||
FilterTaxonomyOptionSet(options)
|
||||
options.BoolVar(&__fixed_pattern__, "fixed", false,
|
||||
options.Alias("F"),
|
||||
@@ -70,6 +74,12 @@ func OptionSet(options *getoptions.GetOpt) {
|
||||
options.ArgName("TAXID"),
|
||||
options.Description("Dump a sub-taxonomy corresponding to the precised clade"),
|
||||
)
|
||||
options.BoolVar(&__download_ncbi__, "download-ncbi", __download_ncbi__,
|
||||
options.Description("Download the current NCBI taxonomy taxdump"),
|
||||
)
|
||||
options.BoolVar(&__extract_taxonomy__, "extract-taxonomy", __extract_taxonomy__,
|
||||
options.Description("Extract taxonomy from a sequence file"),
|
||||
)
|
||||
}
|
||||
|
||||
func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) {
|
||||
@@ -81,13 +91,14 @@ func CLITaxonomicalRestrictions() (*obitax.TaxonSet, error) {
|
||||
|
||||
ts := taxonomy.NewTaxonSet()
|
||||
for _, taxid := range __taxonomical_restriction__ {
|
||||
tx := taxonomy.Taxon(taxid)
|
||||
tx, _, err := taxonomy.Taxon(taxid)
|
||||
|
||||
if tx == nil {
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf(
|
||||
"cannot find taxon %s in taxonomy %s",
|
||||
"cannot find taxon %s in taxonomy %s (%v)",
|
||||
taxid,
|
||||
taxonomy.Name(),
|
||||
err,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -144,3 +155,11 @@ func CLIDumpSubtaxonomy() bool {
|
||||
func CLISubTaxonomyNode() string {
|
||||
return __to_dump__
|
||||
}
|
||||
|
||||
func CLIDownloadNCBI() bool {
|
||||
return __download_ncbi__
|
||||
}
|
||||
|
||||
func CLIExtractTaxonomy() bool {
|
||||
return __extract_taxonomy__
|
||||
}
|
||||
|
||||
@@ -93,3 +93,145 @@ func MapToMapInterface(m interface{}) map[string]interface{} {
|
||||
log.Panic("Invalid map type")
|
||||
return make(map[string]interface{})
|
||||
}
|
||||
|
||||
// InterfaceToInt converts a interface{} to an integer value if possible.
|
||||
// If not a "NotAnInteger" error is returned via the err
|
||||
// return value and val is set to 0.
|
||||
func InterfaceToInt(i interface{}) (val int, err error) {
|
||||
|
||||
err = nil
|
||||
val = 0
|
||||
|
||||
switch t := i.(type) {
|
||||
case int:
|
||||
val = t
|
||||
case int8:
|
||||
val = int(t) // standardizes across systems
|
||||
case int16:
|
||||
val = int(t) // standardizes across systems
|
||||
case int32:
|
||||
val = int(t) // standardizes across systems
|
||||
case int64:
|
||||
val = int(t) // standardizes across systems
|
||||
case float32:
|
||||
val = int(t) // standardizes across systems
|
||||
case float64:
|
||||
val = int(t) // standardizes across systems
|
||||
case uint8:
|
||||
val = int(t) // standardizes across systems
|
||||
case uint16:
|
||||
val = int(t) // standardizes across systems
|
||||
case uint32:
|
||||
val = int(t) // standardizes across systems
|
||||
case uint64:
|
||||
val = int(t) // standardizes across systems
|
||||
default:
|
||||
err = &NotAnInteger{"value attribute cannot be casted to an integer"}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// InterfaceToInt converts a interface{} to an integer value if possible.
|
||||
// If not a "NotAnInteger" error is returned via the err
|
||||
// return value and val is set to 0.
|
||||
func InterfaceToFloat64(i interface{}) (val float64, err error) {
|
||||
|
||||
err = nil
|
||||
val = 0
|
||||
|
||||
switch t := i.(type) {
|
||||
case int:
|
||||
val = float64(t)
|
||||
case int8:
|
||||
val = float64(t) // standardizes across systems
|
||||
case int16:
|
||||
val = float64(t) // standardizes across systems
|
||||
case int32:
|
||||
val = float64(t) // standardizes across systems
|
||||
case int64:
|
||||
val = float64(t) // standardizes across systems
|
||||
case float32:
|
||||
val = float64(t) // standardizes across systems
|
||||
case float64:
|
||||
val = t // standardizes across systems
|
||||
case uint8:
|
||||
val = float64(t) // standardizes across systems
|
||||
case uint16:
|
||||
val = float64(t) // standardizes across systems
|
||||
case uint32:
|
||||
val = float64(t) // standardizes across systems
|
||||
case uint64:
|
||||
val = float64(t) // standardizes across systems
|
||||
default:
|
||||
err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
|
||||
err = nil
|
||||
|
||||
switch i := i.(type) {
|
||||
case map[string]int:
|
||||
val = i
|
||||
case map[string]interface{}:
|
||||
val = make(map[string]int, len(i))
|
||||
for k, v := range i {
|
||||
val[k], err = InterfaceToInt(v)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
case map[string]float64:
|
||||
val = make(map[string]int, len(i))
|
||||
for k, v := range i {
|
||||
val[k] = int(v)
|
||||
}
|
||||
default:
|
||||
err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
|
||||
err = nil
|
||||
|
||||
switch i := i.(type) {
|
||||
case map[string]string:
|
||||
val = i
|
||||
case map[string]interface{}:
|
||||
val = make(map[string]string, len(i))
|
||||
for k, v := range i {
|
||||
val[k], err = InterfaceToString(v)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
default:
|
||||
err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func InterfaceToStringSlice(i interface{}) (val []string, err error) {
|
||||
err = nil
|
||||
|
||||
switch i := i.(type) {
|
||||
case []string:
|
||||
val = i
|
||||
case []interface{}:
|
||||
val = make([]string, len(i))
|
||||
for k, v := range i {
|
||||
val[k], err = InterfaceToString(v)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
default:
|
||||
err = &NotAMapInt{"value attribute cannot be casted to a []string"}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
45
pkg/obiutils/download.go
Normal file
45
pkg/obiutils/download.go
Normal file
@@ -0,0 +1,45 @@
|
||||
package obiutils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"os"
|
||||
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
|
||||
func DownloadFile(url string, filepath string) error {
|
||||
// Get the data
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// Check server response
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return fmt.Errorf("bad status: %s", resp.Status)
|
||||
}
|
||||
|
||||
// Create the file
|
||||
out, err := os.Create(filepath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer out.Close()
|
||||
|
||||
// Create progress bar
|
||||
bar := progressbar.DefaultBytes(
|
||||
resp.ContentLength,
|
||||
"downloading",
|
||||
)
|
||||
|
||||
// Write the body to file while updating the progress bar
|
||||
_, err = io.Copy(io.MultiWriter(out, bar), resp.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
@@ -25,43 +25,6 @@ func (m *NotAnInteger) Error() string {
|
||||
return m.message
|
||||
}
|
||||
|
||||
// InterfaceToInt converts a interface{} to an integer value if possible.
|
||||
// If not a "NotAnInteger" error is returned via the err
|
||||
// return value and val is set to 0.
|
||||
func InterfaceToInt(i interface{}) (val int, err error) {
|
||||
|
||||
err = nil
|
||||
val = 0
|
||||
|
||||
switch t := i.(type) {
|
||||
case int:
|
||||
val = t
|
||||
case int8:
|
||||
val = int(t) // standardizes across systems
|
||||
case int16:
|
||||
val = int(t) // standardizes across systems
|
||||
case int32:
|
||||
val = int(t) // standardizes across systems
|
||||
case int64:
|
||||
val = int(t) // standardizes across systems
|
||||
case float32:
|
||||
val = int(t) // standardizes across systems
|
||||
case float64:
|
||||
val = int(t) // standardizes across systems
|
||||
case uint8:
|
||||
val = int(t) // standardizes across systems
|
||||
case uint16:
|
||||
val = int(t) // standardizes across systems
|
||||
case uint32:
|
||||
val = int(t) // standardizes across systems
|
||||
case uint64:
|
||||
val = int(t) // standardizes across systems
|
||||
default:
|
||||
err = &NotAnInteger{"value attribute cannot be casted to an integer"}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// NotAnFloat64 defines a new type of Error : "NotAnFloat64"
|
||||
type NotAnFloat64 struct {
|
||||
message string
|
||||
@@ -74,43 +37,6 @@ func (m *NotAnFloat64) Error() string {
|
||||
return m.message
|
||||
}
|
||||
|
||||
// InterfaceToInt converts a interface{} to an integer value if possible.
|
||||
// If not a "NotAnInteger" error is returned via the err
|
||||
// return value and val is set to 0.
|
||||
func InterfaceToFloat64(i interface{}) (val float64, err error) {
|
||||
|
||||
err = nil
|
||||
val = 0
|
||||
|
||||
switch t := i.(type) {
|
||||
case int:
|
||||
val = float64(t)
|
||||
case int8:
|
||||
val = float64(t) // standardizes across systems
|
||||
case int16:
|
||||
val = float64(t) // standardizes across systems
|
||||
case int32:
|
||||
val = float64(t) // standardizes across systems
|
||||
case int64:
|
||||
val = float64(t) // standardizes across systems
|
||||
case float32:
|
||||
val = float64(t) // standardizes across systems
|
||||
case float64:
|
||||
val = t // standardizes across systems
|
||||
case uint8:
|
||||
val = float64(t) // standardizes across systems
|
||||
case uint16:
|
||||
val = float64(t) // standardizes across systems
|
||||
case uint32:
|
||||
val = float64(t) // standardizes across systems
|
||||
case uint64:
|
||||
val = float64(t) // standardizes across systems
|
||||
default:
|
||||
err = &NotAnFloat64{"value attribute cannot be casted to a float value"}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// NotAMapInt defines a new type of Error : "NotAMapInt"
|
||||
type NotAMapInt struct {
|
||||
message string
|
||||
@@ -123,53 +49,6 @@ func (m *NotAMapInt) Error() string {
|
||||
return m.message
|
||||
}
|
||||
|
||||
func InterfaceToIntMap(i interface{}) (val map[string]int, err error) {
|
||||
err = nil
|
||||
|
||||
switch i := i.(type) {
|
||||
case map[string]int:
|
||||
val = i
|
||||
case map[string]interface{}:
|
||||
val = make(map[string]int, len(i))
|
||||
for k, v := range i {
|
||||
val[k], err = InterfaceToInt(v)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
case map[string]float64:
|
||||
val = make(map[string]int, len(i))
|
||||
for k, v := range i {
|
||||
val[k] = int(v)
|
||||
}
|
||||
default:
|
||||
err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func InterfaceToStringMap(i interface{}) (val map[string]string, err error) {
|
||||
err = nil
|
||||
|
||||
switch i := i.(type) {
|
||||
case map[string]string:
|
||||
val = i
|
||||
case map[string]interface{}:
|
||||
val = make(map[string]string, len(i))
|
||||
for k, v := range i {
|
||||
val[k], err = InterfaceToString(v)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
default:
|
||||
err = &NotAMapInt{"value attribute cannot be casted to a map[string]int"}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// NotAMapFloat64 defines a new type of Error : "NotAMapFloat64"
|
||||
type NotAMapFloat64 struct {
|
||||
message string
|
||||
|
||||
@@ -23,7 +23,7 @@ func MakeSet[E comparable](vals ...E) Set[E] {
|
||||
// It takes a variadic parameter of type E, where E is a comparable type.
|
||||
// It returns a pointer to a Set of type E.
|
||||
func NewSet[E comparable](vals ...E) *Set[E] {
|
||||
s := MakeSet[E](vals...)
|
||||
s := MakeSet(vals...)
|
||||
return &s
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ func TestNewSet(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test Case 2: Creating a set with multiple values
|
||||
set2 := NewSet[string]("apple", "banana", "cherry")
|
||||
set2 := NewSet("apple", "banana", "cherry")
|
||||
if len(*set2) != 3 {
|
||||
t.Errorf("Expected size to be 3, but got %d", len(*set2))
|
||||
}
|
||||
@@ -147,7 +147,7 @@ func TestMembers(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test case 2: Set with multiple elements
|
||||
set = MakeSet[int](1, 2, 3)
|
||||
set = MakeSet(1, 2, 3)
|
||||
expected = []int{1, 2, 3}
|
||||
actual = set.Members()
|
||||
sort.Ints(actual)
|
||||
@@ -172,7 +172,7 @@ func TestSetString(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test set with single member
|
||||
singleMemberSet := NewSet[int](42)
|
||||
singleMemberSet := NewSet(42)
|
||||
singleMemberSetString := singleMemberSet.String()
|
||||
expectedSingleMemberSetString := "[42]"
|
||||
if singleMemberSetString != expectedSingleMemberSetString {
|
||||
@@ -180,7 +180,7 @@ func TestSetString(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test set with multiple members
|
||||
multipleMembersSet := NewSet[int](1, 2, 3)
|
||||
multipleMembersSet := NewSet(1, 2, 3)
|
||||
multipleMembersSetString := multipleMembersSet.String()
|
||||
expectedMultipleMembersSetString := "[1 2 3]"
|
||||
if multipleMembersSetString != expectedMultipleMembersSetString {
|
||||
@@ -213,26 +213,26 @@ func TestUnion(t *testing.T) {
|
||||
|
||||
// Test case 2: Union of an empty set and a non-empty set should return the non-empty set
|
||||
set1 = MakeSet[int]()
|
||||
set2 = MakeSet[int](1, 2, 3)
|
||||
expected = MakeSet[int](1, 2, 3)
|
||||
set2 = MakeSet(1, 2, 3)
|
||||
expected = MakeSet(1, 2, 3)
|
||||
result = set1.Union(set2)
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
t.Errorf("Expected %v, but got %v", expected, result)
|
||||
}
|
||||
|
||||
// Test case 3: Union of two non-empty sets with common elements should return a set with unique elements
|
||||
set1 = MakeSet[int](1, 2, 3)
|
||||
set2 = MakeSet[int](2, 3, 4)
|
||||
expected = MakeSet[int](1, 2, 3, 4)
|
||||
set1 = MakeSet(1, 2, 3)
|
||||
set2 = MakeSet(2, 3, 4)
|
||||
expected = MakeSet(1, 2, 3, 4)
|
||||
result = set1.Union(set2)
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
t.Errorf("Expected %v, but got %v", expected, result)
|
||||
}
|
||||
|
||||
// Test case 4: Union of two non-empty sets with no common elements should return a set with all elements
|
||||
set1 = MakeSet[int](1, 2, 3)
|
||||
set2 = MakeSet[int](4, 5, 6)
|
||||
expected = MakeSet[int](1, 2, 3, 4, 5, 6)
|
||||
set1 = MakeSet(1, 2, 3)
|
||||
set2 = MakeSet(4, 5, 6)
|
||||
expected = MakeSet(1, 2, 3, 4, 5, 6)
|
||||
result = set1.Union(set2)
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
t.Errorf("Expected %v, but got %v", expected, result)
|
||||
|
||||
Reference in New Issue
Block a user