mirror of
https://github.com/metabarcoding/obitools4.git
synced 2026-03-26 22:00:52 +00:00
Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d70b0a5b42 |
19
.github/workflows/obitest.yml
vendored
19
.github/workflows/obitest.yml
vendored
@@ -1,19 +0,0 @@
|
||||
name: "Run the obitools command test suite"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- V*
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v2
|
||||
with:
|
||||
go-version: '1.23'
|
||||
- name: Checkout obitools4 project
|
||||
uses: actions/checkout@v4
|
||||
- name: Run tests
|
||||
run: make githubtests
|
||||
154
.gitignore
vendored
154
.gitignore
vendored
@@ -1,27 +1,135 @@
|
||||
**/cpu.pprof
|
||||
**/cpu.trace
|
||||
**/test
|
||||
**/bin
|
||||
**/vendor
|
||||
**/*.fastq
|
||||
**/*.fasta
|
||||
**/*.fastq.gz
|
||||
**/*.fasta.gz
|
||||
**/.DS_Store
|
||||
**/*.gml
|
||||
**/*.log
|
||||
**/xxx*
|
||||
**/*.sav
|
||||
**/*.old
|
||||
**/*.tgz
|
||||
**/*.yaml
|
||||
**/*.csv
|
||||
cpu.pprof
|
||||
cpu.trace
|
||||
test
|
||||
bin
|
||||
vendor
|
||||
*.fastq
|
||||
*.fasta
|
||||
*.fastq.gz
|
||||
*.fasta.gz
|
||||
.DS_Store
|
||||
*.gml
|
||||
*.log
|
||||
/argaly
|
||||
|
||||
.rhistory
|
||||
/.vscode
|
||||
/obiconvert
|
||||
/obicount
|
||||
/obimultiplex
|
||||
/obipairing
|
||||
/obipcr
|
||||
/obifind
|
||||
/obidistribute
|
||||
/obiuniq
|
||||
/build
|
||||
/Makefile.old
|
||||
.Rproj.user
|
||||
obitools.Rproj
|
||||
Stat_error.knit.md
|
||||
.Rhistory
|
||||
Stat_error.nb.html
|
||||
Stat_error.Rmd
|
||||
|
||||
/ncbitaxo
|
||||
/.luarc.json
|
||||
/doc/TAXO/
|
||||
/doc/results/
|
||||
/doc/_main.log
|
||||
/doc/_book/_main.tex
|
||||
/doc/_freeze/
|
||||
/doc/tutorial_files/
|
||||
/doc/wolf_data/
|
||||
/taxdump/
|
||||
/.vscode/
|
||||
|
||||
!/obitests/**
|
||||
!/sample/**
|
||||
/Algo-Alignement.numbers
|
||||
/Estimate_proba_true_seq.html
|
||||
/Estimate_proba_true_seq.nb.html
|
||||
/Estimate_proba_true_seq.Rmd
|
||||
/modele_error_euka.qmd
|
||||
/obitools.code-workspace
|
||||
.DS_Store
|
||||
.RData
|
||||
x
|
||||
xxx
|
||||
y
|
||||
/doc/wolf_diet.tgz
|
||||
/doc/man/depends
|
||||
/sample/wolf_R1.fasta.gz
|
||||
/sample/wolf_R2.fasta.gz
|
||||
/sample/euka03.ecotag.fasta.gz
|
||||
/sample/ratio.csv
|
||||
/sample/STD_PLN_1.dat
|
||||
/sample/STD_PLN_2.dat
|
||||
/sample/subset_Pasvik_R1.fastq.gz
|
||||
/sample/subset_Pasvik_R2.fastq.gz
|
||||
/sample/test_gobitools.fasta.bz2
|
||||
euka03.csv*
|
||||
gbbct793.seq.gz
|
||||
gbinv1003.seq.gz
|
||||
gbpln210.seq
|
||||
/doc/book/OBITools-V4.aux
|
||||
/doc/book/OBITools-V4.fdb_latexmk
|
||||
/doc/book/OBITools-V4.fls
|
||||
/doc/book/OBITools-V4.log
|
||||
/doc/book/OBITools-V4.pdf
|
||||
/doc/book/OBITools-V4.synctex.gz
|
||||
/doc/book/OBITools-V4.tex
|
||||
/doc/book/OBITools-V4.toc
|
||||
getoptions.adoc
|
||||
Archive.zip
|
||||
.DS_Store
|
||||
sample/.DS_Store
|
||||
sample/consensus_graphs/specimen_hac_plants_Vern_disicolor_.gml
|
||||
93954
|
||||
Bact03.e5.gb_R254.obipcr.idx.fasta.save
|
||||
sample/test.obipcr.log
|
||||
Bact02.e3.gb_R254.obipcr.fasta.gz
|
||||
Example_Arth03.ngsfilter
|
||||
SPER01.csv
|
||||
SPER03.csv
|
||||
wolf_diet_ngsfilter.txt
|
||||
xx
|
||||
xxx.gb
|
||||
yyy_geom.csv
|
||||
yyy_LCS.csv
|
||||
yyy.json
|
||||
bug_obimultiplex/toto
|
||||
bug_obimultiplex/toto_mapping
|
||||
bug_obimultiplex/tutu
|
||||
bug_obimultiplex/tutu_mapping
|
||||
bug_obipairing/GIT1_GH_ngsfilter.txt
|
||||
doc/book/TAXO/citations.dmp
|
||||
doc/book/TAXO/delnodes.dmp
|
||||
doc/book/TAXO/division.dmp
|
||||
doc/book/TAXO/gc.prt
|
||||
doc/book/TAXO/gencode.dmp
|
||||
doc/book/TAXO/merged.dmp
|
||||
doc/book/TAXO/names.dmp
|
||||
doc/book/TAXO/nodes.dmp
|
||||
doc/book/TAXO/readme.txt
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/citations.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/delnodes.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/division.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/gc.prt
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/gencode.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/merged.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/names.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/nodes.dmp
|
||||
doc/book/wolf_data/Release-253/ncbitaxo/readme.txt
|
||||
doc/book/results/toto.tasta
|
||||
sample/.DS_Store
|
||||
GO
|
||||
ncbitaxo/citations.dmp
|
||||
ncbitaxo/delnodes.dmp
|
||||
ncbitaxo/division.dmp
|
||||
ncbitaxo/gc.prt
|
||||
ncbitaxo/gencode.dmp
|
||||
ncbitaxo/merged.dmp
|
||||
ncbitaxo/names.dmp
|
||||
ncbitaxo/nodes.dmp
|
||||
ncbitaxo/readme.txt
|
||||
template.16S
|
||||
xxx.gz
|
||||
*.sav
|
||||
*.old
|
||||
ncbitaxo.tgz
|
||||
*.csv
|
||||
|
||||
9
Makefile
9
Makefile
@@ -63,13 +63,6 @@ update-deps:
|
||||
|
||||
test:
|
||||
$(GOTEST) ./...
|
||||
|
||||
obitests:
|
||||
@for t in $$(find obitests -name test.sh -print) ; do \
|
||||
bash $${t} ;\
|
||||
done
|
||||
|
||||
githubtests: obitools obitests
|
||||
|
||||
man:
|
||||
make -C doc man
|
||||
@@ -104,5 +97,5 @@ ifneq ($(strip $(COMMIT_ID)),)
|
||||
@rm -f $(OUTPUT)
|
||||
endif
|
||||
|
||||
.PHONY: all packages obitools man obibook doc update-deps obitests githubtests .FORCE
|
||||
.PHONY: all packages obitools man obibook doc update-deps .FORCE
|
||||
.FORCE:
|
||||
@@ -37,7 +37,7 @@ curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install
|
||||
bash -s -- --install-dir test_install --obitools-prefix k
|
||||
```
|
||||
|
||||
In this case, the binaries will be installed in the `test_install` directory and all command names will be prefixed with the letter `k`. Thus, `obigrep` will be named `kobigrep`.
|
||||
In this case, the binaries will be installed in the `test_install` directory and all command names will be prefixed with the letter `k`. Thus `obigrep` will be named `kobigrep`.
|
||||
|
||||
## Continuing the analysis...
|
||||
|
||||
|
||||
228
Release-notes.md
228
Release-notes.md
@@ -1,29 +1,19 @@
|
||||
# OBITools release notes
|
||||
|
||||
## March 2nd, 2025. Release 4.3.0
|
||||
|
||||
A new documentation website is available at https://obitools4.metabarcoding.org.
|
||||
Its development is still in progress.
|
||||
## Latest changes
|
||||
|
||||
### Breaking changes
|
||||
|
||||
- In `obimultiplex`, the short version of the **--tag-list** option used to
|
||||
specify the list of tags and primers to be used for the demultiplexing has
|
||||
been changed from `-t` to `-s`.
|
||||
- In `obimultiplex`, the short version of the **--tag-list** option used to specify the list
|
||||
of tags and primers to be used for the demultiplexing has been changed from `-t` to `-s`.
|
||||
|
||||
- The command `obifind` is now renamed `obitaxonomy`.
|
||||
|
||||
- The **--taxdump** option used to specify the path to the taxdump containing
|
||||
the NCBI taxonomy has been renamed to **--taxonomy**.
|
||||
- The **--taxdump** option used to specify the path to the taxdump containing the NCBI taxonomy
|
||||
has been renamed to **--taxonomy**.
|
||||
|
||||
### Bug fixes
|
||||
|
||||
- Correction of a bug when using paired sequence file with the **--out** option.
|
||||
|
||||
- Correction of a bug in `obitag` when trying to annotate very short sequences of
|
||||
4 bases or less.
|
||||
|
||||
|
||||
- In `obipairing`, correct the stats `seq_a_single` and `seq_b_single` when
|
||||
on right alignment mode
|
||||
|
||||
@@ -31,32 +21,12 @@ Its development is still in progress.
|
||||
the batch size and not reading the qualities from the fastq files as `obiuniq`
|
||||
is producing only fasta output without qualities.
|
||||
|
||||
- In `obitag`, correct the wrong assignment of the **obitag_bestmatch**
|
||||
attribute.
|
||||
|
||||
- In `obiclean`, the **--no-progress-bar** option disables all progress bars,
|
||||
not just the data.
|
||||
|
||||
- Several fixes in reading FASTA and FASTQ files, including some code
|
||||
simplification and factorization.
|
||||
|
||||
- Fixed a bug in all obitools that caused the same file to be processed
|
||||
multiple times, when specifying a directory name as input.
|
||||
|
||||
|
||||
### New features
|
||||
|
||||
- `obigrep` add a new **--valid-taxid** option to keep only sequence with a
|
||||
valid taxid
|
||||
|
||||
- `obiclean` add a new **--min-sample-count** option with a default value of 1,
|
||||
asking to filter out sequences which are not occurring in at least the
|
||||
specified number of samples.
|
||||
|
||||
- `obitaxonomy` a new **--dump|D** option allows for dumping a sub-taxonomy.
|
||||
|
||||
- Taxonomy dump can now be provided as a four-columns CSV file to the
|
||||
**--taxonomy** option.
|
||||
- Taxonomy dump can now be provided as a four-columns CSV file to the **--taxonomy**
|
||||
option.
|
||||
|
||||
- NCBI Taxonomy dump does not need to be uncompressed and unarchived anymore. The
|
||||
path of the tar and gzipped dump file can be directly specified using the
|
||||
@@ -67,50 +37,54 @@ Its development is still in progress.
|
||||
allow the processing of the rare fasta and fastq files not recognized.
|
||||
|
||||
- In `obiscript`, adds new methods to the Lua sequence object:
|
||||
- `md5_string()`: returning the MD5 check sum as a hexadecimal string,
|
||||
- `subsequence(from,to)`: allows extracting a subsequence on a 0 based
|
||||
coordinate system, upper bound excluded like in go.
|
||||
- `reverse_complement`: returning a sequence object corresponding to the
|
||||
reverse complement of the current sequence.
|
||||
- `md5_string()`: returning the MD5 check sum as a hexadecimal string,
|
||||
- `subsequence(from,to)`: allows to extract a subsequence on a 0 based
|
||||
coordinate system, upper bound excluded like in Go.
|
||||
- `reverse_complement`: returning a sequence object corresponding to the reverse complement
|
||||
of the current sequence.
|
||||
|
||||
### Enhancement
|
||||
### Change of git repository
|
||||
|
||||
- In every *OBITools* command, the progress bar is automatically deactivated
|
||||
when the standard error output is redirected.
|
||||
- As Genbank and ENA:EMBL contain very large sequences, while
|
||||
OBITools4 is optimized for short sequences, `obipcr` faces some problems
|
||||
with excessive consumption of computer resources, especially memory. Several
|
||||
improvements in the tuning of the default `obipcr` parameters and some new
|
||||
features, currently only available for FASTA and FASTQ file readers, have
|
||||
been implemented to limit the memory impact of `obipcr` without changing the
|
||||
computational efficiency too much.
|
||||
- Logging system and therefore format, have been homogenized.
|
||||
|
||||
|
||||
### Change of git repository
|
||||
|
||||
- The OBITools4 git repository has been moved to the GitHub repository.
|
||||
- The OBITools4 git repository has been moved to the github repository.
|
||||
The new address is: https://github.com/metabarcoding/obitools4.
|
||||
Take care for using the new install script for retrieving the new version.
|
||||
|
||||
```bash
|
||||
curl -L https://metabarcoding.org/obitools4/install.sh \
|
||||
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh \
|
||||
| bash
|
||||
```
|
||||
|
||||
or with options:
|
||||
|
||||
```bash
|
||||
curl -L https://metabarcoding.org/obitools4/install.sh \
|
||||
curl -L https://raw.githubusercontent.com/metabarcoding/obitools4/master/install_obitools.sh \
|
||||
| bash -s -- --install-dir test_install --obitools-prefix k
|
||||
```
|
||||
|
||||
### CPU limitation
|
||||
|
||||
- By default, *OBITools4* tries to use all the computing power available on
|
||||
your computer. In some circumstances this can be problematic (e.g. if you
|
||||
are running on a computer cluster managed by your university). You can limit
|
||||
the number of CPU cores used by *OBITools4* either by using the **--max-cpu**
|
||||
option or by setting the **OBIMAXCPU** environment variable. Some strange
|
||||
behaviour of *OBITools4* has been observed when users try to limit the
|
||||
maximum number of usable CPU cores to one. This seems to be caused by the Go
|
||||
language, and it is not obvious to get *OBITools4* to run correctly on a
|
||||
single core in all circumstances. Therefore, if you ask to use a single
|
||||
core, **OBITools4** will print a warning message and actually set this
|
||||
parameter to two cores. If you really want a single core, you can use the
|
||||
**--force-one-core** option. But be aware that this can lead to incorrect
|
||||
calculations.
|
||||
|
||||
### New features
|
||||
|
||||
- The output of the obitools will evolve to produce results only in standard
|
||||
formats such as fasta and fastq. For non-sequential data, the output will be
|
||||
in CSV format, with the separator `,`, the decimal separator `.`, and a
|
||||
header line with the column names. It is more convenient to use the output
|
||||
in other programs. For example, you can use the `csvtomd` command to
|
||||
reformat the CSV output into a Markdown table. The first command to initiate
|
||||
reformat the csv output into a markdown table. The first command to initiate
|
||||
this change is `obicount`, which now produces a 3-line CSV output.
|
||||
|
||||
```bash
|
||||
@@ -122,7 +96,7 @@ Its development is still in progress.
|
||||
database for `obitag` is to use `obipcr` on a local copy of Genbank or EMBL.
|
||||
However, these sequence databases are known to contain many taxonomic
|
||||
errors, such as bacterial sequences annotated with the taxid of their host
|
||||
species. `obicleandb` tries to detect these errors. To do this, it first keeps
|
||||
species. obicleandb tries to detect these errors. To do this, it first keeps
|
||||
only sequences annotated with the taxid to which a species, genus, and
|
||||
family taxid can be assigned. Then, for each sequence, it compares the
|
||||
distance of the sequence to the other sequences belonging to the same genus
|
||||
@@ -133,7 +107,7 @@ Its development is still in progress.
|
||||
with the p-value of the Mann-Whitney U test in the **obicleandb_trusted**
|
||||
slot. Later, the distribution of this p-value can be analyzed to determine a
|
||||
threshold. Empirically, a threshold of 0.05 is a good compromise and allows
|
||||
filtering out less than 1‰ of the sequences. These sequences can then be
|
||||
to filter out less than 1‰ of the sequences. These sequences can then be
|
||||
removed using `obigrep`.
|
||||
|
||||
- Adds a new `obijoin` utility to join information contained in a sequence
|
||||
@@ -143,16 +117,16 @@ Its development is still in progress.
|
||||
|
||||
- Adds a new tool `obidemerge` to demerge a `merge_xxx` slot by recreating the
|
||||
multiple identical sequences having the slot `xxx` recreated with its initial
|
||||
value and the sequence count set to the number of occurrences referred in the
|
||||
value and the sequence count set to the number of occurrences referred to in the
|
||||
`merge_xxx` slot. During the operation, the `merge_xxx` slot is removed.
|
||||
|
||||
- Adds CSV as one of the input format for every obitools command. To encode
|
||||
sequence the CSV file must include a column named `sequence` and another
|
||||
sequence the CSV file must include a column named `sequence` and another
|
||||
column named `id`. An extra column named `qualities` can be added to specify
|
||||
the quality scores of the sequence following the same ASCII encoding than the
|
||||
the quality scores of the sequence following the same ASCII encoding as the
|
||||
fastq format. All the other columns will be considered as annotations and will
|
||||
be interpreted as JSON objects encoding potentially for atomic values. If a
|
||||
column value can not be decoded as JSON it will be considered as a string.
|
||||
column value cannot be decoded as JSON, it will be considered as a string.
|
||||
|
||||
- A new option **--version** has been added to every obitools command. It will
|
||||
print the version of the command.
|
||||
@@ -161,8 +135,8 @@ Its development is still in progress.
|
||||
quality scores from a BioSequence object.\
|
||||
|
||||
- In `obimultiplex` the ngsfilter file describing the samples can now be provided
|
||||
not only using the classical ngsfilter format but also using the CSV format.
|
||||
When using CSV, the first line must contain the column names. 5 columns are
|
||||
not only using the classical ngsfilter format but also using the CSV format.
|
||||
When using csv, the first line must contain the column names. 5 columns are
|
||||
expected:
|
||||
|
||||
- `experiment` the name of the experiment
|
||||
@@ -178,34 +152,43 @@ Its development is still in progress.
|
||||
|
||||
Supplementary columns are allowed. Their names and content will be used to
|
||||
annotate the sequence corresponding to the sample, as the `key=value;` did
|
||||
in the ngsfilter format.
|
||||
in the ngsfilter format.
|
||||
|
||||
The CSV format used allows for comment lines starting with `#` character.
|
||||
Special data lines starting with `@param` in the first column allow configuring the algorithm. The options **--template** provided an over
|
||||
commented example of the CSV format, including all the possible options.
|
||||
|
||||
### CPU limitation
|
||||
Special data lines starting with `@param` in the first column allow to
|
||||
configure the algorithm. The options **--template** provided an over
|
||||
commented example of the csv format, including all the possible options.
|
||||
|
||||
- By default, *OBITools4* tries to use all the computing power available on
|
||||
your computer. In some circumstances this can be problematic (e.g. if you
|
||||
are running on a computer cluster managed by your university). You can limit
|
||||
the number of CPU cores used by *OBITools4* either by using the **--max-cpu**
|
||||
option or by setting the **OBIMAXCPU** environment variable. Some strange
|
||||
behavior of *OBITools4* has been observed when users try to limit the
|
||||
maximum number of usable CPU cores to one. This seems to be caused by the Go
|
||||
language, and it is not obvious to get *OBITools4* to run correctly on a
|
||||
single core in all circumstances. Therefore, if you ask to use a single
|
||||
core, **OBITools4** will print a warning message and actually set this
|
||||
parameter to two cores. If you really want a single core, you can use the
|
||||
**--force-one-core** option. But be aware that this can lead to incorrect
|
||||
calculations.
|
||||
### Enhancement
|
||||
|
||||
- In every *OBITools* command, the progress bar is automatically deactivated
|
||||
when the standard error output is redirected.
|
||||
- As Genbank and ENA:EMBL contain very large sequences, while
|
||||
OBITools4 is optimised for short sequences, `obipcr` faces some problems
|
||||
with excessive consumption of computer resources, especially memory. Several
|
||||
improvements in the tuning of the default `obipcr` parameters and some new
|
||||
features, currently only available for FASTA and FASTQ file readers, have
|
||||
been implemented to limit the memory impact of `obipcr` without changing the
|
||||
computational efficiency too much.
|
||||
- Logging system and therefore format, have been homogenized.
|
||||
|
||||
### Bug
|
||||
|
||||
- In `obitag`, correct the wrong assignment of the **obitag_bestmatch**
|
||||
attribute.
|
||||
- In `obiclean`, the **--no-progress-bar** option disables all progress bars,
|
||||
not just the data.
|
||||
- Several fixes in reading FASTA and FASTQ files, including some code
|
||||
simplification and factorization.
|
||||
- Fixed a bug in all obitools that caused the same file to be processed
|
||||
multiple times, when specifying a directory name as input.
|
||||
|
||||
## April 2nd, 2024. Release 4.2.0
|
||||
|
||||
### New features
|
||||
|
||||
- A new OBITools named `obiscript` allows processing each sequence according
|
||||
- A new OBITools named `obiscript` allows to process each sequence according
|
||||
to a Lua script. This is an experimental tool. The **--template** option
|
||||
allows for generating an example script on the `stdout`.
|
||||
|
||||
@@ -213,7 +196,7 @@ Its development is still in progress.
|
||||
|
||||
- Two of the main class `obiseq.SeqWorker` and `obiseq.SeqWorker` have their
|
||||
declaration changed. Both now return two values a `obiseq.BioSequenceSlice`
|
||||
and an `error`. This allows a worker to return potentially several sequences
|
||||
and an `error`. This allows a worker to return potentially several sequences
|
||||
as the result of the processing of a single sequence, or zero, which is
|
||||
equivalent to filter out the input sequence.
|
||||
|
||||
@@ -221,12 +204,12 @@ Its development is still in progress.
|
||||
|
||||
- In `obitag` if the reference database contains sequences annotated by taxid
|
||||
not referenced in the taxonomy, the corresponding sequences are discarded
|
||||
from the reference database and a warning indicating the sequence *id* and the
|
||||
from the reference database and a warning indicating the sequence id and the
|
||||
wrong taxid is emitted.
|
||||
- The bug corrected in the parsing of EMBL and Genbank files as implemented in
|
||||
version 4.1.2 of OBITools4, potentially induced some reduction in the
|
||||
performance of the parsing. This should have been now fixed.
|
||||
- In the same idea, parsing of Genbank and EMBL files were reading and storing
|
||||
- In the same idea, parsing of genbank and EMBL files were reading and storing
|
||||
in memory not only the sequence but also the annotations (features table).
|
||||
Up to now none of the OBITools are using this information, but with large
|
||||
complete genomes, it is occupying a lot of memory. To reduce this impact,
|
||||
@@ -265,7 +248,7 @@ Its development is still in progress.
|
||||
|
||||
### New feature
|
||||
|
||||
- In `obimatrix` a **--transpose** option allows transposing the produced
|
||||
- In `obimatrix` a **--transpose** option allows to transpose the produced
|
||||
matrix table in CSV format.
|
||||
- In `obipairing` and `obipcrtag` two new options **--exact-mode** and
|
||||
**--fast-absolute** to control the heuristic used in the alignment
|
||||
@@ -273,7 +256,7 @@ Its development is still in progress.
|
||||
the exact algorithm at the cost of a speed. **--fast-absolute** change the
|
||||
scoring schema of the heuristic.
|
||||
- In `obiannotate` adds the possibility to annotate the first match of a
|
||||
pattern using the same algorithm as the one used in `obipcr` and
|
||||
pattern using the same algorithm as the one used in `obipcr` and
|
||||
`obimultiplex`. For that four option were added :
|
||||
- **--pattern** : to specify the pattern. It can use IUPAC codes and
|
||||
position with no error tolerated has to be followed by a `#` character.
|
||||
@@ -354,7 +337,7 @@ Its development is still in progress.
|
||||
|
||||
### Bugs
|
||||
|
||||
- In the obitools language, the `composition` function now returns a map
|
||||
- in the obitools language, the `composition` function now returns a map
|
||||
indexed by lowercase string "a", "c", "g", "t" and "o" for other instead of
|
||||
being indexed by the ASCII codes of the corresponding letters.
|
||||
- Correction of the reverse-complement operation. Every reverse complement of
|
||||
@@ -367,18 +350,18 @@ Its development is still in progress.
|
||||
duplicating the quality values. This made `obimultiplex` to produce fastq
|
||||
files with sequences having quality values duplicated.
|
||||
|
||||
### Be careful
|
||||
### Be careful
|
||||
|
||||
GO 1.21.0 is out, and it includes new functionalities which are used in the
|
||||
OBITools4 code. If you use the recommended method for compiling OBITools on your
|
||||
computer, there is no problem, as the script always load the latest GO version.
|
||||
If you rely on your personal GO install, please think to update.
|
||||
OBITools4 code. If you use the recommended method for compiling OBITools on your
|
||||
computer, there is no problem, as the script always loads the latest GO version.
|
||||
If you rely on your personal GO install, please think to update.
|
||||
|
||||
## August 29th, 2023. Release 4.0.5
|
||||
|
||||
### Bugs
|
||||
|
||||
- Patch a bug in the `obiseq.BioSequence` constructor leading to an error on
|
||||
- Patch a bug in the `obiseq.BioSequence` constructor leading to an error on
|
||||
almost every obitools. The error message indicates : `fatal error: sync:
|
||||
unlock of unlocked mutex` This bug was introduced in the release 4.0.4
|
||||
|
||||
@@ -397,7 +380,7 @@ If you rely on your personal GO install, please think to update.
|
||||
data structure to limit the number of alignments actually computed. This
|
||||
increase a bit the speed of both the software. `obirefidx` is nevertheless
|
||||
still too slow compared to my expectation.
|
||||
- Switch to a parallel version of the GZIP library, allowing for high speed
|
||||
- Switch to a parallel version of the gzip library, allowing for high speed
|
||||
compress and decompress operation on files.
|
||||
|
||||
### New feature
|
||||
@@ -441,12 +424,12 @@ If you rely on your personal GO install, please think to update.
|
||||
--unidentified not_assigned.fastq
|
||||
```
|
||||
|
||||
The command produced four files : `tagged_library_R1.fastq` and
|
||||
the command produced four files : `tagged_library_R1.fastq` and
|
||||
`tagged_library_R2.fastq` containing the assigned reads and
|
||||
`not_assigned_R1.fastq` and `not_assigned_R2.fastq` containing the
|
||||
unassignable reads.
|
||||
|
||||
The tagged library files can then be split using `obidistribute`:
|
||||
the tagged library files can then be split using `obidistribute`:
|
||||
|
||||
```{bash}
|
||||
mkdir pcr_reads
|
||||
@@ -456,9 +439,9 @@ If you rely on your personal GO install, please think to update.
|
||||
|
||||
- Adding of two options **--add-lca-in** and **--lca-error** to `obiannotate`.
|
||||
These options aim to help during construction of reference database using
|
||||
`obipcr`. On `obipcr` output, it is commonly run `obiuniq`. To merge identical
|
||||
`obipcr`. On obipcr output, it is commonly run obiuniq. To merge identical
|
||||
sequences annotated with different taxids, it is now possible to use the
|
||||
following strategies :
|
||||
following strategy :
|
||||
|
||||
```{bash}
|
||||
obiuniq -m taxid myrefdb.obipcr.fasta \
|
||||
@@ -489,7 +472,7 @@ If you rely on your personal GO install, please think to update.
|
||||
- Correction of a bug in `obiconsensus` leading into the deletion of a base
|
||||
close to the beginning of the consensus sequence.
|
||||
|
||||
## March 31st, 2023. Release 4.0.2
|
||||
## March 31st, 2023. Release 4.0.2
|
||||
|
||||
### Compiler change
|
||||
|
||||
@@ -500,15 +483,15 @@ If you rely on your personal GO install, please think to update.
|
||||
- Add the possibility for looking pattern with indels. This has been added to
|
||||
`obimultiplex` through the **--with-indels** option.
|
||||
- Every obitools command has a **--pprof** option making the command
|
||||
publishing a profiling website available at the address :
|
||||
publishing a profiling web site available at the address :
|
||||
<http://localhost:8080/debug/pprof/>
|
||||
- A new `obiconsensus` command has been added. It is a prototype. It aims to
|
||||
build a consensus sequence from a set of reads. The consensus is estimated
|
||||
for all the sequences contained in the input file. If several input files,
|
||||
or a directory name are provided the result contains a consensus per file.
|
||||
The *id* of the sequence is the name of the input file depleted of its
|
||||
The id of the sequence is the name of the input file depleted of its
|
||||
directory name and of all its extensions.
|
||||
- In `obipcr` an experimental option **--fragmented** allows for splitting very
|
||||
- In `obipcr` an experimental option **--fragmented** allows for splitting very
|
||||
long query sequences into shorter fragments with an overlap between the two
|
||||
contiguous fragment insuring that no amplicons are missed despite the split.
|
||||
As a side effect, some amplicons can be identified twice.
|
||||
@@ -551,7 +534,7 @@ If you rely on your personal GO install, please think to update.
|
||||
### Enhancement
|
||||
|
||||
- *OBITools* are automatically processing all the sequences files contained in
|
||||
a directory and its subdirectory\
|
||||
a directory and its sub-directory\
|
||||
recursively if its name is provided as input. To process easily Genbank
|
||||
files, the corresponding filename extensions have been added. Today the
|
||||
following extensions are recognized as sequence files : `.fasta`, `.fastq`,
|
||||
@@ -568,7 +551,7 @@ If you rely on your personal GO install, please think to update.
|
||||
export OBICPUMAX=4
|
||||
```
|
||||
|
||||
- Adds a new option --out\|-o allowing to specify the name of an output file.
|
||||
- Adds a new option --out\|-o allowing to specify the name of an output file.
|
||||
|
||||
``` bash
|
||||
obiconvert -o xyz.fasta xxx.fastq
|
||||
@@ -590,10 +573,10 @@ If you rely on your personal GO install, please think to update.
|
||||
matched files remain consistent when processed.
|
||||
|
||||
- Adding of the function `ifelse` to the expression language for computing
|
||||
conditional values.
|
||||
conditional values.
|
||||
|
||||
- Adding two function to the expression language related to sequence
|
||||
composition : `composition` and `gcskew`. Both are taking a sequence as
|
||||
composition : `composition` and `gcskew`. Both are taking a sequence as
|
||||
single argument.
|
||||
|
||||
## February 18th, 2023. Release 4.0.0
|
||||
@@ -601,8 +584,8 @@ If you rely on your personal GO install, please think to update.
|
||||
It is the first version of the *OBITools* version 4. I decided to tag then
|
||||
following two weeks of intensive data analysis with them allowing to discover
|
||||
many small bugs present in the previous non-official version. Obviously other
|
||||
bugs are certainly present in the code, and you are welcome to use the git
|
||||
ticket system to mention them. But they seem to produce now reliable results.
|
||||
bugs are certainly present in the code, and you are welcome to use the git
|
||||
ticket system to mention them. But they seem to produce now reliable results.
|
||||
|
||||
### Corrected bugs
|
||||
|
||||
@@ -610,11 +593,11 @@ ticket system to mention them. But they seem to produce now reliable results.
|
||||
of sequences and to the production of incorrect file because of the last
|
||||
sequence record, sometime truncated in its middle. This was only occurring
|
||||
when more than a single CPU was used. It was affecting every obitools.
|
||||
- The `obiparing` software had a bug in the right alignment procedure. This led
|
||||
to the non-alignment of very sort barcode during the paring of the forward
|
||||
- The `obipairing` software had a bug in the right alignment procedure. This led
|
||||
to the non-alignment of very short barcodes during the pairing of the forward
|
||||
and reverse reads.
|
||||
- The `obipairing` tools had a non-deterministic comportment when aligning a
|
||||
pair very low quality reads. This induced that the result of the same low
|
||||
- The `obipairing` tools had a non deterministic comportment when aligning a
|
||||
pair of very low quality reads. This induced that the result of the same low
|
||||
quality read pair was not the same from run to run.
|
||||
|
||||
### New features
|
||||
@@ -622,10 +605,11 @@ ticket system to mention them. But they seem to produce now reliable results.
|
||||
- Adding of a `--compress|-Z` option to every obitools allowing to produce
|
||||
`gz` compressed output. OBITools were already able to deal with gzipped input
|
||||
files transparently. They can now produce their results in the same format.
|
||||
- Adding of a `--append|-A` option to the `obidistribute` tool. It allows appending the result of an `obidistribute` execution to preexisting files.
|
||||
- Adding of a `--append|-A` option to the `obidistribute` tool. It allows to
|
||||
append the result of an `obidistribute` execution to preexisting files.
|
||||
- Adding of a `--directory|-d` option to the `obidistribute` tool. It allows
|
||||
declaring a secondary classification key over the one defined by the
|
||||
`--category|-c` option. This extra key leads to produce directories in
|
||||
to declare a secondary classification key over the one defined by the
|
||||
`--category|-c` option. This extra key leads to produce directories in
|
||||
which files produced according to the primary criterion are stored.
|
||||
- Adding of the functions `subspc`, `printf`, `int`, `numeric`, and `bool` to
|
||||
the expression language.
|
||||
42
cmd/obitools/obimicroasm/main.go
Normal file
42
cmd/obitools/obimicroasm/main.go
Normal file
@@ -0,0 +1,42 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obioptions"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obimicroasm"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
func main() {
|
||||
|
||||
// go tool pprof -http=":8000" ./obipairing ./cpu.pprof
|
||||
// f, err := os.Create("cpu.pprof")
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
// pprof.StartCPUProfile(f)
|
||||
// defer pprof.StopCPUProfile()
|
||||
|
||||
// go tool trace cpu.trace
|
||||
// ftrace, err := os.Create("cpu.trace")
|
||||
// if err != nil {
|
||||
// log.Fatal(err)
|
||||
// }
|
||||
// trace.Start(ftrace)
|
||||
// defer trace.Stop()
|
||||
|
||||
optionParser := obioptions.GenerateOptionParser(obimicroasm.OptionSet)
|
||||
|
||||
optionParser(os.Args)
|
||||
|
||||
obidefault.SetStrictReadWorker(2)
|
||||
obidefault.SetStrictWriteWorker(2)
|
||||
|
||||
seq := obimicroasm.CLIAssemblePCR()
|
||||
|
||||
println(obiformats.FormatFasta(seq, obiformats.FormatFastSeqJsonHeader))
|
||||
obiutils.WaitForLastPipe()
|
||||
}
|
||||
12
go.mod
12
go.mod
@@ -5,9 +5,7 @@ go 1.23.1
|
||||
require (
|
||||
github.com/DavidGamba/go-getoptions v0.28.0
|
||||
github.com/PaesslerAG/gval v1.2.2
|
||||
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9
|
||||
github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df
|
||||
github.com/buger/jsonparser v1.1.1
|
||||
github.com/chen3feng/stl4go v0.1.1
|
||||
github.com/dlclark/regexp2 v1.11.4
|
||||
github.com/goccy/go-json v0.10.3
|
||||
@@ -16,7 +14,7 @@ require (
|
||||
github.com/rrethy/ahocorasick v1.0.0
|
||||
github.com/schollz/progressbar/v3 v3.13.1
|
||||
github.com/sirupsen/logrus v1.9.3
|
||||
github.com/stretchr/testify v1.8.4
|
||||
github.com/stretchr/testify v1.10.0
|
||||
github.com/tevino/abool/v2 v2.1.0
|
||||
github.com/yuin/gopher-lua v1.1.1
|
||||
golang.org/x/exp v0.0.0-20231006140011-7918f672742d
|
||||
@@ -26,12 +24,18 @@ require (
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/Clever/csvlint v0.3.0 // indirect
|
||||
github.com/TuftsBCB/io v0.0.0-20140121014543-22b94e9b23f9 // indirect
|
||||
github.com/buger/jsonparser v1.1.1 // indirect
|
||||
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/ef-ds/deque/v2 v2.0.2 // indirect
|
||||
github.com/goombaio/orderedmap v0.0.0-20180924084748-ba921b7e2419 // indirect
|
||||
github.com/kr/pretty v0.3.0 // indirect
|
||||
github.com/kr/text v0.2.0 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/rogpeppe/go-internal v1.6.1 // indirect
|
||||
go.etcd.io/bbolt v1.4.0 // indirect
|
||||
)
|
||||
|
||||
require (
|
||||
@@ -45,7 +49,7 @@ require (
|
||||
github.com/shopspring/decimal v1.3.1 // indirect
|
||||
github.com/ulikunitz/xz v0.5.11
|
||||
golang.org/x/net v0.17.0 // indirect
|
||||
golang.org/x/sys v0.17.0 // indirect
|
||||
golang.org/x/sys v0.29.0 // indirect
|
||||
golang.org/x/term v0.13.0 // indirect
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c
|
||||
)
|
||||
|
||||
13
go.sum
13
go.sum
@@ -1,3 +1,5 @@
|
||||
github.com/Clever/csvlint v0.3.0 h1:58WEFXWy+i0fCbxTXscR2QwYESRuAUFjEGLgZs6j2iU=
|
||||
github.com/Clever/csvlint v0.3.0/go.mod h1:+wLRuW/bI8NhpRoeyUBxqKsK35OhvgJhXHSWdKp5XJU=
|
||||
github.com/DavidGamba/go-getoptions v0.28.0 h1:18wgEvfZdrlfIhVDGEBO3Dl0fkOyXqXLa0tLMCKxM1c=
|
||||
github.com/DavidGamba/go-getoptions v0.28.0/go.mod h1:zE97E3PR9P3BI/HKyNYgdMlYxodcuiC6W68KIgeYT84=
|
||||
github.com/PaesslerAG/gval v1.2.2 h1:Y7iBzhgE09IGTt5QgGQ2IdaYYYOU134YGHBThD+wm9E=
|
||||
@@ -12,6 +14,8 @@ github.com/buger/jsonparser v1.1.1 h1:2PnMjfWD7wBILjqQbt530v576A/cAbQvEW9gGIpYMU
|
||||
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
||||
github.com/chen3feng/stl4go v0.1.1 h1:0L1+mDw7pomftKDruM23f1mA7miavOj6C6MZeadzN2Q=
|
||||
github.com/chen3feng/stl4go v0.1.1/go.mod h1:5ml3psLgETJjRJnMbPE+JiHLrCpt+Ajc2weeTECXzWU=
|
||||
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5OhCuC+XN+r/bBCmeuuJtjz+bCNIf8=
|
||||
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
@@ -21,6 +25,8 @@ github.com/dlclark/regexp2 v1.11.4/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cn
|
||||
github.com/dsnet/compress v0.0.1 h1:PlZu0n3Tuv04TzpfPbrnI0HW/YwodEXDS+oPKahKF0Q=
|
||||
github.com/dsnet/compress v0.0.1/go.mod h1:Aw8dCMJ7RioblQeTqt88akK31OvO8Dhf5JflhBbQEHo=
|
||||
github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdfkVLjJ8T6VcRQv3SXugXy999NBtR9aFY=
|
||||
github.com/ef-ds/deque/v2 v2.0.2 h1:GQtDK1boBMu/qsNbSLQsqzwNptaioxZI39X3UxT5ALA=
|
||||
github.com/ef-ds/deque/v2 v2.0.2/go.mod h1:hoZy4VooWLhRT4uS+sSCilfgBQUNptJU2FGqr08a5sc=
|
||||
github.com/gabriel-vasile/mimetype v1.4.3 h1:in2uUcidCuFcDKtdcBxlR0rJ1+fsokWf+uqxgUFjbI0=
|
||||
github.com/gabriel-vasile/mimetype v1.4.3/go.mod h1:d8uq/6HKRL6CGdk+aubisF/M5GcPfT7nKyLpA0lbSSk=
|
||||
github.com/goccy/go-json v0.10.3 h1:KZ5WoDbxAIgm2HNbYckL0se1fHD6rz5j4ywS6ebzDqA=
|
||||
@@ -69,9 +75,12 @@ github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ
|
||||
github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
|
||||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
|
||||
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
|
||||
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
|
||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
|
||||
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
github.com/tevino/abool/v2 v2.1.0 h1:7w+Vf9f/5gmKT4m4qkayb33/92M+Um45F2BkHOR+L/c=
|
||||
github.com/tevino/abool/v2 v2.1.0/go.mod h1:+Lmlqk6bHDWHqN1cbxqhwEAwMPXgc8I1SDEamtseuXY=
|
||||
github.com/ulikunitz/xz v0.5.6/go.mod h1:2bypXElzHzzJZwzH67Y6wb67pO62Rzfn7BSiF4ABRW8=
|
||||
@@ -79,6 +88,8 @@ github.com/ulikunitz/xz v0.5.11 h1:kpFauv27b6ynzBNT/Xy+1k+fK4WswhN/6PN5WhFAGw8=
|
||||
github.com/ulikunitz/xz v0.5.11/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
|
||||
github.com/yuin/gopher-lua v1.1.1 h1:kYKnWBjvbNP4XLT3+bPEwAXJx262OhaHDWDVOPjL46M=
|
||||
github.com/yuin/gopher-lua v1.1.1/go.mod h1:GBR0iDaNXjAgGg9zfCvksxSRnQx76gclCIb7kdAd1Pw=
|
||||
go.etcd.io/bbolt v1.4.0 h1:TU77id3TnN/zKr7CO/uk+fBCwF2jGcMuw2B/FMAzYIk=
|
||||
go.etcd.io/bbolt v1.4.0/go.mod h1:AsD+OCi/qPN1giOX1aiLAha3o1U8rAz65bvN4j0sRuk=
|
||||
golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
|
||||
golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
|
||||
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
|
||||
@@ -88,6 +99,8 @@ golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBc
|
||||
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y=
|
||||
golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/sys v0.29.0 h1:TPYlXGxvx1MGTn2GiZDhnjPA9wZzZeGKHHmKhHYvgaU=
|
||||
golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U=
|
||||
golang.org/x/term v0.13.0 h1:bb+I9cTfFazGW51MZqBVmZy7+JEJMouUHTUSKVQLBek=
|
||||
golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U=
|
||||
|
||||
@@ -2,12 +2,9 @@ git.sr.ht/~sbinet/gg v0.3.1 h1:LNhjNn8DerC8f9DHLz6lS0YYul/b602DUxDgGkd/Aik=
|
||||
git.sr.ht/~sbinet/gg v0.3.1/go.mod h1:KGYtlADtqsqANL9ueOFkWymvzUvLMQllU5Ixo+8v3pc=
|
||||
github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b h1:slYM766cy2nI3BwyRiyQj/Ud48djTMtMebDqepE95rw=
|
||||
github.com/ajstarks/svgo v0.0.0-20211024235047-1546f124cd8b/go.mod h1:1KcenG0jGWcpt8ov532z81sp/kMMUG485J2InIOyADM=
|
||||
github.com/buger/jsonparser v1.1.1/go.mod h1:6RYKKt7H4d4+iWqouImQ9R2FZql3VbhNgx27UK13J/0=
|
||||
github.com/chzyer/logex v1.1.10 h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE=
|
||||
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
|
||||
github.com/chzyer/logex v1.2.0 h1:+eqR0HfOetur4tgnC8ftU5imRnhi4te+BadWS95c5AM=
|
||||
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5OhCuC+XN+r/bBCmeuuJtjz+bCNIf8=
|
||||
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
|
||||
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8=
|
||||
github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
|
||||
github.com/chzyer/test v0.0.0-20210722231415-061457976a23 h1:dZ0/VyGgQdVGAss6Ju0dt5P0QltE0SFY5Woh6hbIfiQ=
|
||||
@@ -29,16 +26,21 @@ github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN
|
||||
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
|
||||
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
|
||||
github.com/ianlancetaylor/demangle v0.0.0-20220319035150-800ac71e25c2 h1:rcanfLhLDA8nozr/K289V1zcntHr3V+SHlXwzz1ZI2g=
|
||||
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
|
||||
github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213 h1:qGQQKEcAR99REcMpsXCp3lJ03zYT1PkRd3kQGPn9GVg=
|
||||
github.com/klauspost/cpuid v1.2.0 h1:NMpwD2G9JSFOE1/TJjGSo5zG7Yb2bTe7eq1jH+irmeE=
|
||||
github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw=
|
||||
github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
|
||||
github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng=
|
||||
github.com/smallnest/goroutine v1.1.1/go.mod h1:Fp8f6ZReubfdj0m4+NcUnW4IsAqKa+Pnrv9opEiD43E=
|
||||
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
|
||||
github.com/spf13/pflag v1.0.6/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
|
||||
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
|
||||
github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c=
|
||||
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
|
||||
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
|
||||
github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE=
|
||||
go.etcd.io/gofail v0.2.0/go.mod h1:nL3ILMGfkXTekKI3clMBNazKnjUZjYLKmBHzsVAnC1o=
|
||||
golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
|
||||
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
|
||||
golang.org/x/image v0.6.0 h1:bR8b5okrPI3g/gyZakLZHeWxAR8Dn5CyxXv1hLH5g/4=
|
||||
@@ -46,6 +48,7 @@ golang.org/x/image v0.6.0/go.mod h1:MXLdDR43H7cDJq5GEGXEVeeNhPgi+YYEQ2pC1byI1x0=
|
||||
golang.org/x/mod v0.13.0 h1:I/DsJXRlw/8l/0c24sM9yb0T4z9liZTduXvdAWYiysY=
|
||||
golang.org/x/mod v0.13.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
|
||||
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 h1:uVc8UZUe6tr40fFVnUP5Oj+veunVezqYl9z7DYw9xzw=
|
||||
golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
|
||||
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
|
||||
golang.org/x/tools v0.14.0 h1:jvNa2pY0M4r62jkRQ6RwEZZyPcymeL9XZMLBbV7U2nc=
|
||||
|
||||
@@ -1,144 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Give the name of the test series here
|
||||
#
|
||||
|
||||
TEST_NAME=obicount
|
||||
|
||||
######
|
||||
#
|
||||
# Some variable and function definitions: please don't change them
|
||||
#
|
||||
######
|
||||
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
||||
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||
|
||||
|
||||
TMPDIR="$(mktemp -d)"
|
||||
ntest=0
|
||||
success=0
|
||||
failed=0
|
||||
|
||||
cleanup() {
|
||||
echo "========================================" 1>&2
|
||||
echo "## Results of the $TEST_NAME tests:" 1>&2
|
||||
|
||||
echo 1>&2
|
||||
echo "- $ntest tests run" 1>&2
|
||||
echo "- $success successfully completed" 1>&2
|
||||
echo "- $failed failed tests" 1>&2
|
||||
echo 1>&2
|
||||
echo "Cleaning up the temporary directory..." 1>&2
|
||||
echo 1>&2
|
||||
echo "========================================" 1>&2
|
||||
|
||||
rm -rf "$TMPDIR" # Suppress the temporary directory
|
||||
|
||||
if [ $failed -gt 0 ]; then
|
||||
log "$TEST_NAME tests failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
||||
}
|
||||
|
||||
log() {
|
||||
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
||||
}
|
||||
|
||||
log "Testing $TEST_NAME..."
|
||||
log "Test directory is $TEST_DIR"
|
||||
log "obitools directory is $OBITOOLS_DIR"
|
||||
log "Temporary directory is $TMPDIR"
|
||||
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||
|
||||
######################################################################
|
||||
####
|
||||
#### Below are the tests
|
||||
####
|
||||
#### Before each test :
|
||||
#### - increment the variable ntest
|
||||
####
|
||||
#### Run the command as the condition of an if / then /else
|
||||
#### - The command must return 0 on success
|
||||
#### - The command must return an exit code different from 0 on failure
|
||||
#### - The datafiles are stored in the same directory as the test script
|
||||
#### - The test script directory is stored in the TEST_DIR variable
|
||||
#### - If result files have to be produced they must be stored
|
||||
#### in the temporary directory (TMPDIR variable)
|
||||
####
|
||||
#### then clause is executed on success of the command
|
||||
#### - Write a success message using the log function
|
||||
#### - increment the variable success
|
||||
####
|
||||
#### else clause is executed on failure of the command
|
||||
#### - Write a failure message using the log function
|
||||
#### - increment the variable failed
|
||||
####
|
||||
######################################################################
|
||||
|
||||
((ntest++))
|
||||
if obicount "${TEST_DIR}/wolf_F.fasta.gz" \
|
||||
> "${TMPDIR}/wolf_F.fasta_count.csv"
|
||||
then
|
||||
log "OBICount: fasta reading OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: fasta reading failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obicount "${TEST_DIR}/wolf_F.fastq.gz" \
|
||||
> "${TMPDIR}/wolf_F.fastq_count.csv"
|
||||
then
|
||||
log "OBICount: fastq reading OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: fastq reading failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obicount "${TEST_DIR}/wolf_F.csv.gz" \
|
||||
> "${TMPDIR}/wolf_F.csv_count.csv"
|
||||
then
|
||||
log "OBICount: csv reading OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: csv reading failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if diff "${TMPDIR}/wolf_F.fasta_count.csv" \
|
||||
"${TMPDIR}/wolf_F.fastq_count.csv" > /dev/null
|
||||
then
|
||||
log "OBICount: counting on fasta and fastq are identical OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: counting on fasta and fastq are different failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if diff "${TMPDIR}/wolf_F.fasta_count.csv" \
|
||||
"${TMPDIR}/wolf_F.csv_count.csv" > /dev/null
|
||||
then
|
||||
log "OBICount: counting on fasta and csv are identical OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBICount: counting on fasta and csv are different failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
#########################################
|
||||
#
|
||||
# At the end of the tests
|
||||
# the cleanup function is called
|
||||
#
|
||||
#########################################
|
||||
|
||||
cleanup
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -1,134 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Give the name of the test series here
|
||||
#
|
||||
|
||||
TEST_NAME=obiparing
|
||||
|
||||
######
|
||||
#
|
||||
# Some variable and function definitions: please don't change them
|
||||
#
|
||||
######
|
||||
TEST_DIR="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
|
||||
OBITOOLS_DIR="${TEST_DIR/obitest*/}build"
|
||||
export PATH="${OBITOOLS_DIR}:${PATH}"
|
||||
|
||||
|
||||
TMPDIR="$(mktemp -d)"
|
||||
ntest=0
|
||||
success=0
|
||||
failed=0
|
||||
|
||||
cleanup() {
|
||||
echo "========================================" 1>&2
|
||||
echo "## Results of the $TEST_NAME tests:" 1>&2
|
||||
|
||||
echo 1>&2
|
||||
echo "- $ntest tests run" 1>&2
|
||||
echo "- $success successfully completed" 1>&2
|
||||
echo "- $failed failed tests" 1>&2
|
||||
echo 1>&2
|
||||
echo "Cleaning up the temporary directory..." 1>&2
|
||||
echo 1>&2
|
||||
echo "========================================" 1>&2
|
||||
|
||||
rm -rf "$TMPDIR" # Suppress the temporary directory
|
||||
|
||||
if [ $failed -gt 0 ]; then
|
||||
log "$TEST_NAME tests failed"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
exit 0
|
||||
}
|
||||
|
||||
log() {
|
||||
echo -e "[$TEST_NAME @ $(date)] $*" 1>&2
|
||||
}
|
||||
|
||||
log "Testing $TEST_NAME..."
|
||||
log "Test directory is $TEST_DIR"
|
||||
log "obitools directory is $OBITOOLS_DIR"
|
||||
log "Temporary directory is $TMPDIR"
|
||||
log "files: $(find $TEST_DIR | awk -F'/' '{print $NF}' | tail -n +2)"
|
||||
|
||||
######################################################################
|
||||
####
|
||||
#### Below are the tests
|
||||
####
|
||||
#### Before each test :
|
||||
#### - increment the variable ntest
|
||||
####
|
||||
#### Run the command as the condition of an if / then /else
|
||||
#### - The command must return 0 on success
|
||||
#### - The command must return an exit code different from 0 on failure
|
||||
#### - The datafiles are stored in the same directory as the test script
|
||||
#### - The test script directory is stored in the TEST_DIR variable
|
||||
#### - If result files have to be produced they must be stored
|
||||
#### in the temporary directory (TMPDIR variable)
|
||||
####
|
||||
#### then clause is executed on success of the command
|
||||
#### - Write a success message using the log function
|
||||
#### - increment the variable success
|
||||
####
|
||||
#### else clause is executed on failure of the command
|
||||
#### - Write a failure message using the log function
|
||||
#### - increment the variable failed
|
||||
####
|
||||
######################################################################
|
||||
|
||||
((ntest++))
|
||||
if obipairing -F "${TEST_DIR}/wolf_F.fastq.gz" \
|
||||
-R "${TEST_DIR}/wolf_R.fastq.gz" \
|
||||
| obidistribute -Z -c mode \
|
||||
-p "${TMPDIR}/wolf_paired_%s.fastq.gz"
|
||||
then
|
||||
log "OBIPairing: sequence pairing OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIPairing: sequence pairing failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obicsv -Z -s -i \
|
||||
-k ali_dir -k ali_length -k paring_fast_count \
|
||||
-k paring_fast_overlap -k paring_fast_score \
|
||||
-k score -k score_norm -k seq_a_single \
|
||||
-k seq_b_single -k seq_ab_match \
|
||||
"${TMPDIR}/wolf_paired_alignment.fastq.gz" \
|
||||
> "${TMPDIR}/wolf_paired_alignment.csv.gz" \
|
||||
&& zdiff -c "${TEST_DIR}/wolf_paired_alignment.csv.gz" \
|
||||
"${TMPDIR}/wolf_paired_alignment.csv.gz"
|
||||
then
|
||||
log "OBIPairing: check aligned sequences OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIPairing: check aligned sequences failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
((ntest++))
|
||||
if obicsv -Z -s -i \
|
||||
"${TMPDIR}/wolf_paired_join.fastq.gz" \
|
||||
> "${TMPDIR}/wolf_paired_join.csv.gz" \
|
||||
&& zdiff -c "${TEST_DIR}/wolf_paired_join.csv.gz" \
|
||||
"${TMPDIR}/wolf_paired_join.csv.gz"
|
||||
then
|
||||
log "OBIPairing: check joined sequences OK"
|
||||
((success++))
|
||||
else
|
||||
log "OBIPairing: check joined sequences failed"
|
||||
((failed++))
|
||||
fi
|
||||
|
||||
#########################################
|
||||
#
|
||||
# At the end of the tests
|
||||
# the cleanup function is called
|
||||
#
|
||||
#########################################
|
||||
|
||||
cleanup
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -10,7 +10,6 @@ import (
|
||||
"strings"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
// // A pool of byte slices.
|
||||
@@ -159,30 +158,12 @@ func BuildQualityConsensus(seqA, seqB *obiseq.BioSequence, path []int, statOnMis
|
||||
|
||||
match := 0
|
||||
|
||||
left := obiutils.Abs(path[0])
|
||||
right := 0
|
||||
if path[len(path)-1] == 0 {
|
||||
right = path[len(path)-2]
|
||||
}
|
||||
|
||||
right = obiutils.Abs(right)
|
||||
|
||||
right = len(*bufferQA) - right
|
||||
|
||||
// log.Warnf("BuildQualityConsensus: left = %d right = %d\n", left, right)
|
||||
|
||||
for i, qA = range *bufferQA {
|
||||
nA := (*bufferSA)[i]
|
||||
nB := (*bufferSB)[i]
|
||||
qB = (*bufferQB)[i]
|
||||
|
||||
if statOnMismatch && i >= left && i < right && nA != nB {
|
||||
if nA == ' ' {
|
||||
nA = '-'
|
||||
}
|
||||
if nB == ' ' {
|
||||
nB = '-'
|
||||
}
|
||||
if statOnMismatch && nA != nB && nA != ' ' && nB != ' ' {
|
||||
mismatches[strings.ToUpper(fmt.Sprintf("(%c:%02d)->(%c:%02d)", nA, qA, nB, qB))] = i + 1
|
||||
}
|
||||
|
||||
@@ -202,12 +183,13 @@ func BuildQualityConsensus(seqA, seqB *obiseq.BioSequence, path []int, statOnMis
|
||||
|
||||
q := qA + qB
|
||||
|
||||
if nA != nB {
|
||||
q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/40))*10+0.5)
|
||||
}
|
||||
|
||||
if nA == nB {
|
||||
match++
|
||||
if qA > 0 && qB > 0 {
|
||||
if nA != nB {
|
||||
q = qM - byte(math.Log10(1-math.Pow(10, -float64(qm)/30))*10+0.5)
|
||||
}
|
||||
if nA == nB {
|
||||
match++
|
||||
}
|
||||
}
|
||||
|
||||
if q > 90 {
|
||||
|
||||
@@ -74,30 +74,6 @@ func _Logaddexp(a, b float64) float64 {
|
||||
return b + math.Log1p(math.Exp(a-b))
|
||||
}
|
||||
|
||||
func _Log1mexp(a float64) float64 {
|
||||
if a > 0 {
|
||||
log.Panic("Log1mexp: a > 0")
|
||||
}
|
||||
|
||||
if a == 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
return (math.Log(-math.Expm1(a)))
|
||||
}
|
||||
|
||||
func _Logdiffexp(a, b float64) float64 {
|
||||
if a < b {
|
||||
log.Panic("Log1mexp: a < b")
|
||||
}
|
||||
|
||||
if a == b {
|
||||
return math.Inf(-1)
|
||||
}
|
||||
|
||||
return a + _Log1mexp(b-a)
|
||||
}
|
||||
|
||||
// _MatchScoreRatio calculates the match score ratio between two bytes.
|
||||
//
|
||||
// Parameters:
|
||||
@@ -107,25 +83,25 @@ func _Logdiffexp(a, b float64) float64 {
|
||||
// Returns:
|
||||
// - float64: the match score ratio when a match is observed
|
||||
// - float64: the match score ratio when a mismatch is observed
|
||||
func _MatchScoreRatio(QF, QR byte) (float64, float64) {
|
||||
func _MatchScoreRatio(a, b byte) (float64, float64) {
|
||||
|
||||
l2 := math.Log(2)
|
||||
l3 := math.Log(3)
|
||||
l4 := math.Log(4)
|
||||
l10 := math.Log(10)
|
||||
qF := -float64(QF) / 10 * l10
|
||||
qR := -float64(QR) / 10 * l10
|
||||
term1 := _Logaddexp(qF, qR)
|
||||
term2 := _Logdiffexp(term1, qF+qR)
|
||||
lalea := math.Log(4) // 1 /(change of the random model)
|
||||
lE1 := -float64(a)/10*l10 - l3 // log proba of sequencing error on A/3
|
||||
lE2 := -float64(b)/10*l10 - l3 // log proba of sequencing error on B/3
|
||||
lO1 := math.Log1p(-math.Exp(lE1 + l3)) // log proba no being an error on A
|
||||
lO2 := math.Log1p(-math.Exp(lE2 + l3)) // log proba no being an error on B
|
||||
lO1O2 := lO1 + lO2
|
||||
lE1E2 := lE1 + lE2
|
||||
lO1E2 := lO1 + lE2
|
||||
lO2E1 := lO2 + lE1
|
||||
|
||||
// log.Warnf("MatchScoreRatio: %v, %v , %v, %v", QF, QR, term1, term2)
|
||||
MM := _Logaddexp(lO1O2, lE1E2+l3) // Proba match when match observed
|
||||
Mm := _Logaddexp(_Logaddexp(lO1E2, lO2E1), lE1E2+l2) // Proba match when mismatch observed
|
||||
|
||||
match_logp := _Log1mexp(term2 + l3 - l4)
|
||||
match_score := match_logp - _Log1mexp(match_logp)
|
||||
|
||||
mismatch_logp := term2 - l4
|
||||
mismatch_score := mismatch_logp - _Log1mexp(mismatch_logp)
|
||||
|
||||
return match_score, mismatch_score
|
||||
return MM + lalea, Mm + lalea
|
||||
}
|
||||
|
||||
func _InitNucPartMatch() {
|
||||
|
||||
@@ -21,15 +21,15 @@ func encodeValues(score, length int, out bool) uint64 {
|
||||
return fo
|
||||
}
|
||||
|
||||
// func _isout(value uint64) bool {
|
||||
// const outmask = uint64(1) << dwsize
|
||||
// return (value & outmask) == 0
|
||||
// }
|
||||
func _isout(value uint64) bool {
|
||||
const outmask = uint64(1) << dwsize
|
||||
return (value & outmask) == 0
|
||||
}
|
||||
|
||||
// func _lpath(value uint64) int {
|
||||
// const mask = uint64(1<<wsize) - 1
|
||||
// return int(((value + 1) ^ mask) & mask)
|
||||
// }
|
||||
func _lpath(value uint64) int {
|
||||
const mask = uint64(1<<wsize) - 1
|
||||
return int(((value + 1) ^ mask) & mask)
|
||||
}
|
||||
|
||||
func decodeValues(value uint64) (int, int, bool) {
|
||||
const mask = uint64(1<<wsize) - 1
|
||||
@@ -57,3 +57,4 @@ func _setout(value uint64) uint64 {
|
||||
var _empty = encodeValues(0, 0, false)
|
||||
var _out = encodeValues(0, 30000, true)
|
||||
var _notavail = encodeValues(0, 30000, false)
|
||||
|
||||
|
||||
@@ -625,8 +625,6 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
|
||||
&arena.pointer.scoreMatrix,
|
||||
&arena.pointer.pathMatrix)
|
||||
|
||||
score = scoreR
|
||||
|
||||
path = _Backtracking(arena.pointer.pathMatrix,
|
||||
len(rawSeqA), len(rawSeqB),
|
||||
&(arena.pointer.path))
|
||||
@@ -643,7 +641,6 @@ func PEAlign(seqA, seqB *obiseq.BioSequence,
|
||||
len(rawSeqA), len(rawSeqB),
|
||||
&(arena.pointer.path))
|
||||
isLeftAlign = true
|
||||
score = scoreL
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ import (
|
||||
"github.com/buger/jsonparser"
|
||||
)
|
||||
|
||||
func _parse_json_map_string(str []byte) (map[string]string, error) {
|
||||
func _parse_json_map_string(str []byte, sequence *obiseq.BioSequence) (map[string]string, error) {
|
||||
values := make(map[string]string)
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -25,7 +25,7 @@ func _parse_json_map_string(str []byte) (map[string]string, error) {
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_map_int(str []byte) (map[string]int, error) {
|
||||
func _parse_json_map_int(str []byte, sequence *obiseq.BioSequence) (map[string]int, error) {
|
||||
values := make(map[string]int)
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -41,7 +41,7 @@ func _parse_json_map_int(str []byte) (map[string]int, error) {
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_map_float(str []byte) (map[string]float64, error) {
|
||||
func _parse_json_map_float(str []byte, sequence *obiseq.BioSequence) (map[string]float64, error) {
|
||||
values := make(map[string]float64)
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -57,7 +57,7 @@ func _parse_json_map_float(str []byte) (map[string]float64, error) {
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_map_bool(str []byte) (map[string]bool, error) {
|
||||
func _parse_json_map_bool(str []byte, sequence *obiseq.BioSequence) (map[string]bool, error) {
|
||||
values := make(map[string]bool)
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -73,7 +73,7 @@ func _parse_json_map_bool(str []byte) (map[string]bool, error) {
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_map_interface(str []byte) (map[string]interface{}, error) {
|
||||
func _parse_json_map_interface(str []byte, sequence *obiseq.BioSequence) (map[string]interface{}, error) {
|
||||
values := make(map[string]interface{})
|
||||
jsonparser.ObjectEach(str,
|
||||
func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) (err error) {
|
||||
@@ -100,7 +100,7 @@ func _parse_json_map_interface(str []byte) (map[string]interface{}, error) {
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_array_string(str []byte) ([]string, error) {
|
||||
func _parse_json_array_string(str []byte, sequence *obiseq.BioSequence) ([]string, error) {
|
||||
values := make([]string, 0)
|
||||
jsonparser.ArrayEach(str,
|
||||
func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
|
||||
@@ -162,7 +162,7 @@ func _parse_json_array_bool(str []byte, sequence *obiseq.BioSequence) ([]bool, e
|
||||
return values, nil
|
||||
}
|
||||
|
||||
func _parse_json_array_interface(str []byte) ([]interface{}, error) {
|
||||
func _parse_json_array_interface(str []byte, sequence *obiseq.BioSequence) ([]interface{}, error) {
|
||||
values := make([]interface{}, 0)
|
||||
jsonparser.ArrayEach(str,
|
||||
func(value []byte, dataType jsonparser.ValueType, offset int, err error) {
|
||||
@@ -261,14 +261,14 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
sequence.SetCount(int(count))
|
||||
|
||||
case skey == "obiclean_weight":
|
||||
weight, err := _parse_json_map_int(value)
|
||||
weight, err := _parse_json_map_int(value, sequence)
|
||||
if err != nil {
|
||||
log.Fatalf("%s: Cannot parse obiclean weight %s", sequence.Id(), string(value))
|
||||
}
|
||||
annotations[skey] = weight
|
||||
|
||||
case skey == "obiclean_status":
|
||||
status, err := _parse_json_map_string(value)
|
||||
status, err := _parse_json_map_string(value, sequence)
|
||||
if err != nil {
|
||||
log.Fatalf("%s: Cannot parse obiclean status %s", sequence.Id(), string(value))
|
||||
}
|
||||
@@ -276,7 +276,7 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
|
||||
case strings.HasPrefix(skey, "merged_"):
|
||||
if dataType == jsonparser.Object {
|
||||
data, err := _parse_json_map_int(value)
|
||||
data, err := _parse_json_map_int(value, sequence)
|
||||
if err != nil {
|
||||
log.Fatalf("%s: Cannot parse merged slot %s: %v", sequence.Id(), skey, err)
|
||||
} else {
|
||||
@@ -316,9 +316,9 @@ func _parse_json_header_(header string, sequence *obiseq.BioSequence) string {
|
||||
annotations[skey], err = strconv.ParseFloat(obiutils.UnsafeString(value), 64)
|
||||
}
|
||||
case jsonparser.Array:
|
||||
annotations[skey], err = _parse_json_array_interface(value)
|
||||
annotations[skey], err = _parse_json_array_interface(value, sequence)
|
||||
case jsonparser.Object:
|
||||
annotations[skey], err = _parse_json_map_interface(value)
|
||||
annotations[skey], err = _parse_json_map_interface(value, sequence)
|
||||
case jsonparser.Boolean:
|
||||
annotations[skey], err = jsonparser.ParseBoolean(value)
|
||||
case jsonparser.Null:
|
||||
|
||||
@@ -19,7 +19,7 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
|
||||
newiter.WaitAndClose()
|
||||
}()
|
||||
|
||||
f := func(iterator IBioSequence) {
|
||||
f := func(iterator IBioSequence, id int) {
|
||||
source := ""
|
||||
for iterator.Next() {
|
||||
news := obiseq.MakeBioSequenceSlice()
|
||||
@@ -66,9 +66,9 @@ func IFragments(minsize, length, overlap, size, nworkers int) Pipeable {
|
||||
}
|
||||
|
||||
for i := 1; i < nworkers; i++ {
|
||||
go f(iterator.Split())
|
||||
go f(iterator.Split(), i)
|
||||
}
|
||||
go f(iterator)
|
||||
go f(iterator, 0)
|
||||
|
||||
return newiter.SortBatches().Rebatch(size)
|
||||
}
|
||||
|
||||
@@ -8,9 +8,12 @@ import (
|
||||
"math/bits"
|
||||
"os"
|
||||
"slices"
|
||||
"sort"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obistats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/ef-ds/deque/v2"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
@@ -89,12 +92,18 @@ type DeBruijnGraph struct {
|
||||
//
|
||||
// *DeBruijnGraph - a pointer to the created De Bruijn's Graph
|
||||
func MakeDeBruijnGraph(kmersize int) *DeBruijnGraph {
|
||||
if kmersize > 31 {
|
||||
log.Panicf("k-mer size %d is too large", kmersize)
|
||||
}
|
||||
|
||||
kmermask := (^uint64(0) << (uint64(kmersize) * 2))
|
||||
|
||||
g := DeBruijnGraph{
|
||||
kmersize: kmersize,
|
||||
kmermask: ^(^uint64(0) << (uint64(kmersize) * 2)), // k-mer mask used to set to 0 the bits that are not in the k-mer
|
||||
prevc: uint64(1) << (uint64(kmersize-1) * 2),
|
||||
prevg: uint64(2) << (uint64(kmersize-1) * 2),
|
||||
prevt: uint64(3) << (uint64(kmersize-1) * 2),
|
||||
kmermask: kmermask, // k-mer mask used to set to 1 the bits that are not in the k-mer
|
||||
prevc: (uint64(1) << (uint64(kmersize-1) * 2)) | kmermask,
|
||||
prevg: (uint64(2) << (uint64(kmersize-1) * 2)) | kmermask,
|
||||
prevt: (uint64(3) << (uint64(kmersize-1) * 2)) | kmermask,
|
||||
graph: make(map[uint64]uint),
|
||||
}
|
||||
|
||||
@@ -161,19 +170,34 @@ func (g *DeBruijnGraph) FilterMinWeight(min int) {
|
||||
}
|
||||
}
|
||||
|
||||
// FilterMinWeight filters the DeBruijnGraph by removing nodes with weight less than the specified minimum.
|
||||
//
|
||||
// min: an integer representing the minimum count threshold.
|
||||
func (g *DeBruijnGraph) FilterMaxWeight(min int) {
|
||||
umin := uint(min)
|
||||
for idx, count := range g.graph {
|
||||
if count > umin {
|
||||
delete(g.graph, idx)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Previouses(index uint64) []uint64 {
|
||||
if _, ok := g.graph[index]; !ok {
|
||||
log.Panicf("k-mer %s (index %d) is not in graph", g.DecodeNode(index), index)
|
||||
}
|
||||
|
||||
rep := make([]uint64, 0, 4)
|
||||
|
||||
index &= ^g.kmermask
|
||||
index >>= 2
|
||||
|
||||
if _, ok := g.graph[index]; ok {
|
||||
rep = append(rep, index)
|
||||
key := index | g.kmermask
|
||||
if _, ok := g.graph[key]; ok {
|
||||
rep = append(rep, key)
|
||||
}
|
||||
|
||||
key := index | g.prevc
|
||||
key = index | g.prevc
|
||||
if _, ok := g.graph[key]; ok {
|
||||
rep = append(rep, key)
|
||||
}
|
||||
@@ -197,7 +221,7 @@ func (g *DeBruijnGraph) Nexts(index uint64) []uint64 {
|
||||
}
|
||||
|
||||
rep := make([]uint64, 0, 4)
|
||||
index = (index << 2) & g.kmermask
|
||||
index = (index << 2) | g.kmermask
|
||||
|
||||
if _, ok := g.graph[index]; ok {
|
||||
rep = append(rep, index)
|
||||
@@ -268,6 +292,33 @@ func (g *DeBruijnGraph) MaxHead() (uint64, int, bool) {
|
||||
return rep, int(max), found
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) Terminals() []uint64 {
|
||||
rep := make([]uint64, 0, 10)
|
||||
|
||||
for k := range g.graph {
|
||||
if len(g.Nexts(k)) == 0 {
|
||||
rep = append(rep, k)
|
||||
}
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxTerminal() (uint64, int, bool) {
|
||||
rep := uint64(0)
|
||||
max := uint(0)
|
||||
found := false
|
||||
for k, w := range g.graph {
|
||||
if len(g.Nexts(k)) == 0 && w > max {
|
||||
rep = k
|
||||
max = w
|
||||
found = true
|
||||
}
|
||||
}
|
||||
|
||||
return rep, int(max), found
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) MaxPath() []uint64 {
|
||||
path := make([]uint64, 0, 1000)
|
||||
ok := false
|
||||
@@ -318,7 +369,11 @@ func (g *DeBruijnGraph) LongestConsensus(id string, min_cov float64) (*obiseq.Bi
|
||||
return nil, fmt.Errorf("graph is empty")
|
||||
}
|
||||
//path := g.LongestPath(max_length)
|
||||
path := g.HaviestPath()
|
||||
path, err := g.HaviestPath(nil, nil, false)
|
||||
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
spath := path
|
||||
|
||||
@@ -481,7 +536,7 @@ func (graph *DeBruijnGraph) append(sequence []byte, current uint64, weight int)
|
||||
}
|
||||
|
||||
current <<= 2
|
||||
current &= graph.kmermask
|
||||
current |= graph.kmermask
|
||||
b := iupac[sequence[0]]
|
||||
current |= b[0]
|
||||
graph.graph[current] = uint(graph.Weight(current) + weight)
|
||||
@@ -495,6 +550,36 @@ func (graph *DeBruijnGraph) append(sequence []byte, current uint64, weight int)
|
||||
}
|
||||
}
|
||||
|
||||
// func (graph *DeBruijnGraph) search(current uint64, extension []byte, path []uint64, error,errormax int) ([]uint64,error) {
|
||||
|
||||
// path = append(path, current)
|
||||
|
||||
// if len(extension) == 0 {
|
||||
// return path,nil
|
||||
// }
|
||||
|
||||
// current <<= 2
|
||||
// current &= graph.kmermask
|
||||
// b := iupac[extension[0]]
|
||||
|
||||
// newPath := path
|
||||
// if len(b) > 1 {
|
||||
// newPath = slices.Clone(path)
|
||||
// }
|
||||
|
||||
// current |= b[0]
|
||||
|
||||
// _, ok := graph.graph[current]
|
||||
// if ok {
|
||||
// newPath = append(newPath, current)
|
||||
// }
|
||||
// rep, err := graph.search(current, extension[1:], newPath, error,errormax)
|
||||
// if err != nil {
|
||||
// return path,err
|
||||
// }
|
||||
|
||||
// }
|
||||
|
||||
// Push appends a BioSequence to the DeBruijnGraph.
|
||||
//
|
||||
// Parameters:
|
||||
@@ -523,6 +608,7 @@ func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
|
||||
initFirstKmer(start+1, key)
|
||||
}
|
||||
} else {
|
||||
key |= graph.kmermask
|
||||
graph.graph[key] = uint(graph.Weight(key) + w)
|
||||
graph.append(s[graph.kmersize:], key, w)
|
||||
}
|
||||
@@ -533,6 +619,110 @@ func (graph *DeBruijnGraph) Push(sequence *obiseq.BioSequence) {
|
||||
}
|
||||
}
|
||||
|
||||
func (graph *DeBruijnGraph) search(sequence []byte, mismatch, errormax int) []uint64 {
|
||||
var initFirstKmer func(start int, key uint64) []uint64
|
||||
|
||||
initFirstKmer = func(start int, key uint64) []uint64 {
|
||||
if start == graph.kmersize {
|
||||
key |= graph.kmermask
|
||||
if _, ok := graph.graph[key]; ok {
|
||||
return []uint64{key}
|
||||
} else {
|
||||
return []uint64{}
|
||||
}
|
||||
}
|
||||
|
||||
keys := make([]uint64, 0, 1000)
|
||||
|
||||
if start == 0 {
|
||||
key = 0
|
||||
}
|
||||
|
||||
key <<= 2
|
||||
b := iupac[sequence[start]]
|
||||
|
||||
for _, code := range b {
|
||||
key &= ^uint64(3)
|
||||
key |= code
|
||||
keys = append(keys, initFirstKmer(start+1, key)...)
|
||||
}
|
||||
|
||||
// w := []string{}
|
||||
// for _, k := range keys {
|
||||
// w = append(w, graph.DecodeNode(k))
|
||||
// }
|
||||
// // log.Warnf("For %s found %d matches : %v", sequence, len(keys), w)
|
||||
|
||||
return keys
|
||||
}
|
||||
|
||||
rep := initFirstKmer(0, 0)
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func (graph *DeBruijnGraph) Search(sequence *obiseq.BioSequence, errormax int) []uint64 {
|
||||
|
||||
s := sequence.Sequence() // Get the sequence as a byte slice
|
||||
|
||||
if len(s) < graph.KmerSize() {
|
||||
s = slices.Clone(s)
|
||||
for len(s) < graph.KmerSize() {
|
||||
s = append(s, 'n')
|
||||
}
|
||||
}
|
||||
|
||||
log.Warnf("searching for %s", s)
|
||||
keys := graph.search(s, 0, errormax)
|
||||
|
||||
for mismatch := 1; mismatch <= errormax; mismatch++ {
|
||||
log.Warnf("searching with %d error for %s", mismatch, s)
|
||||
for probe := range IterateOneError(s[0:graph.kmersize]) {
|
||||
keys = append(keys,
|
||||
graph.search(probe, mismatch, errormax)...,
|
||||
)
|
||||
}
|
||||
}
|
||||
keys = obiutils.Unique(keys)
|
||||
|
||||
return keys
|
||||
}
|
||||
|
||||
func (graph *DeBruijnGraph) BackSearch(sequence *obiseq.BioSequence, errormax int) []uint64 {
|
||||
lkmer := graph.KmerSize()
|
||||
|
||||
s := sequence.Sequence() // Get the sequence as a byte slice
|
||||
|
||||
if len(s) < lkmer {
|
||||
sn := []byte{}
|
||||
ls := len(s)
|
||||
for ls < lkmer {
|
||||
sn = append(sn, 'n')
|
||||
ls++
|
||||
}
|
||||
s = append(sn, s...)
|
||||
} else {
|
||||
s = s[(len(s) - lkmer):]
|
||||
}
|
||||
|
||||
log.Warnf("back-searching for %s", s)
|
||||
|
||||
keys := graph.search(s, 0, errormax)
|
||||
|
||||
for mismatch := 1; mismatch <= errormax; mismatch++ {
|
||||
log.Warnf("searching with %d error for %s", mismatch, s)
|
||||
for probe := range IterateOneError(s[0:graph.kmersize]) {
|
||||
// log.Warnf("searching with %d error for %s", mismatch, probe)
|
||||
keys = append(keys,
|
||||
graph.search(probe, mismatch, errormax)...,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
keys = obiutils.Unique(keys)
|
||||
return keys
|
||||
}
|
||||
|
||||
func (graph *DeBruijnGraph) Gml() string {
|
||||
buffer := bytes.NewBuffer(make([]byte, 0, 1000))
|
||||
|
||||
@@ -614,7 +804,7 @@ func (graph *DeBruijnGraph) WriteGml(filename string) error {
|
||||
func (g *DeBruijnGraph) HammingDistance(kmer1, kmer2 uint64) int {
|
||||
ident := ^((kmer1 & kmer2) | (^kmer1 & ^kmer2))
|
||||
ident |= (ident >> 1)
|
||||
ident &= 0x5555555555555555 & g.kmermask
|
||||
ident &= 0x5555555555555555 & ^g.kmermask
|
||||
return bits.OnesCount64(ident)
|
||||
}
|
||||
|
||||
@@ -638,11 +828,23 @@ func (h *UInt64Heap) Pop() any {
|
||||
return x
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) HaviestPath() []uint64 {
|
||||
func (g *DeBruijnGraph) HaviestPath(starts, stops []uint64, backPath bool) ([]uint64, error) {
|
||||
|
||||
if g.HasCycle() {
|
||||
return nil
|
||||
// if g.HasCycle() {
|
||||
// return nil, fmt.Errorf("graph has a cycle")
|
||||
// }
|
||||
|
||||
following := g.Nexts
|
||||
|
||||
if backPath {
|
||||
following = g.Previouses
|
||||
}
|
||||
|
||||
stopNodes := make(map[uint64]bool, len(stops))
|
||||
for _, n := range stops {
|
||||
stopNodes[n] = true
|
||||
}
|
||||
|
||||
// Initialize the distance array and visited set
|
||||
distances := make(map[uint64]int)
|
||||
visited := make(map[uint64]bool)
|
||||
@@ -654,7 +856,11 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 {
|
||||
heap.Init(queue)
|
||||
|
||||
startNodes := make(map[uint64]struct{})
|
||||
for _, n := range g.Heads() {
|
||||
if starts == nil {
|
||||
starts = g.Heads()
|
||||
}
|
||||
|
||||
for _, n := range starts {
|
||||
startNodes[n] = struct{}{}
|
||||
heap.Push(queue, n)
|
||||
distances[n] = g.Weight(n)
|
||||
@@ -686,7 +892,11 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 {
|
||||
log.Warn("current node is 0")
|
||||
}
|
||||
// Update the distance of the neighbors
|
||||
nextNodes := g.Nexts(currentNode)
|
||||
|
||||
nextNodes := following(currentNode)
|
||||
if _, ok := stopNodes[currentNode]; ok {
|
||||
nextNodes = []uint64{}
|
||||
}
|
||||
for _, nextNode := range nextNodes {
|
||||
if nextNode == 0 {
|
||||
log.Warn("next node is 0")
|
||||
@@ -718,16 +928,178 @@ func (g *DeBruijnGraph) HaviestPath() []uint64 {
|
||||
}
|
||||
|
||||
if slices.Contains(heaviestPath, currentNode) {
|
||||
log.Panicf("Cycle detected %v -> %v (%v) len(%v), graph: %v", heaviestPath, currentNode, startNodes, len(heaviestPath), g.Len())
|
||||
return nil
|
||||
return nil, fmt.Errorf("cycle detected in heaviest path")
|
||||
}
|
||||
|
||||
heaviestPath = append(heaviestPath, currentNode)
|
||||
|
||||
// Reverse the path
|
||||
slices.Reverse(heaviestPath)
|
||||
if !backPath {
|
||||
slices.Reverse(heaviestPath)
|
||||
}
|
||||
|
||||
return heaviestPath
|
||||
return heaviestPath, nil
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) HaviestPathDSU(starts, stops []uint64, backPath bool) ([]uint64, error) {
|
||||
// Collect and sort edges
|
||||
type Edge struct {
|
||||
weight float64
|
||||
u, v uint64
|
||||
}
|
||||
edges := make([]Edge, 0)
|
||||
|
||||
// Function to get next nodes (either Nexts or Previouses based on backPath)
|
||||
following := g.Nexts
|
||||
previouses := g.Previouses
|
||||
if backPath {
|
||||
following = g.Previouses
|
||||
previouses = g.Nexts
|
||||
}
|
||||
|
||||
// Collect all edges
|
||||
for u := range g.graph {
|
||||
for _, v := range following(u) {
|
||||
edges = append(edges, Edge{
|
||||
weight: float64(min(g.Weight(u), g.Weight(v))),
|
||||
u: u,
|
||||
v: v,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Sort edges by weight in descending order
|
||||
sort.Slice(edges, func(i, j int) bool {
|
||||
return edges[i].weight > edges[j].weight
|
||||
})
|
||||
|
||||
// Initialize disjoint set data structure
|
||||
parent := make(map[uint64]uint64)
|
||||
for u := range g.graph {
|
||||
parent[u] = u
|
||||
}
|
||||
|
||||
// Find with path compression
|
||||
var find func(uint64) uint64
|
||||
find = func(node uint64) uint64 {
|
||||
if parent[node] != node {
|
||||
parent[node] = find(parent[node])
|
||||
}
|
||||
return parent[node]
|
||||
}
|
||||
|
||||
// Union function that returns true if cycle is detected
|
||||
union := func(u, v uint64) bool {
|
||||
rootU := find(u)
|
||||
rootV := find(v)
|
||||
if rootU == rootV {
|
||||
return true // Cycle detected
|
||||
}
|
||||
parent[rootV] = rootU
|
||||
return false
|
||||
}
|
||||
|
||||
// If no specific starts provided, use graph heads
|
||||
if starts == nil {
|
||||
if !backPath {
|
||||
starts = g.Heads()
|
||||
} else {
|
||||
starts = g.Terminals()
|
||||
}
|
||||
}
|
||||
|
||||
// If no specific stops provided, use graph terminals
|
||||
if stops == nil {
|
||||
if !backPath {
|
||||
stops = g.Terminals()
|
||||
} else {
|
||||
stops = g.Heads()
|
||||
}
|
||||
}
|
||||
|
||||
// Convert stops to a map for O(1) lookup
|
||||
stopNodes := make(map[uint64]bool)
|
||||
for _, stop := range stops {
|
||||
stopNodes[stop] = false
|
||||
}
|
||||
|
||||
var path []uint64
|
||||
maxCapacity := math.Inf(-1)
|
||||
stopEdge := []Edge{}
|
||||
|
||||
// Process edges in descending order of weight
|
||||
for _, edge := range edges {
|
||||
if stopNodes[edge.u] {
|
||||
continue // Skip edges from stop nodes
|
||||
}
|
||||
|
||||
if in, ok := stopNodes[edge.v]; ok {
|
||||
if !in {
|
||||
stopEdge = append(stopEdge, edge)
|
||||
stopNodes[edge.v] = true
|
||||
}
|
||||
}
|
||||
|
||||
if union(edge.u, edge.v) {
|
||||
continue // Skip if creates cycle
|
||||
}
|
||||
|
||||
pathFound := false
|
||||
for _, sedge := range stopEdge {
|
||||
// Check if any start-stop pair is connected
|
||||
fv := find(sedge.v)
|
||||
for _, s := range starts {
|
||||
fs := find(s)
|
||||
// log.Warnf("Start: %d, Stop: %d", fs, fv)
|
||||
if fs == fv {
|
||||
pathFound = true
|
||||
maxCapacity = edge.weight
|
||||
|
||||
// Reconstruct path
|
||||
current := sedge.v
|
||||
path = []uint64{current}
|
||||
for current != s {
|
||||
oldcurrent := current
|
||||
// log.Warnf("Start: %d, Current: %d, Previous: %v", s, current, previouses(current))
|
||||
for _, prev := range previouses(current) {
|
||||
if find(prev) == fs {
|
||||
path = append(path, prev)
|
||||
current = prev
|
||||
break
|
||||
}
|
||||
}
|
||||
if current == oldcurrent {
|
||||
log.Fatalf("We are stuck")
|
||||
}
|
||||
|
||||
}
|
||||
// log.Warnf("Built path: %v", path)
|
||||
break
|
||||
}
|
||||
}
|
||||
if pathFound {
|
||||
break
|
||||
}
|
||||
}
|
||||
if pathFound {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// log.Warnf("Stop edge: %v", stopEdge)
|
||||
|
||||
// Process edges in descending order of weight
|
||||
|
||||
if path == nil {
|
||||
return nil, fmt.Errorf("no valid path found")
|
||||
}
|
||||
|
||||
if !backPath {
|
||||
slices.Reverse(path)
|
||||
}
|
||||
log.Warnf("Max capacity: %5.0f: %v", maxCapacity, g.DecodePath(path))
|
||||
|
||||
return path, nil
|
||||
}
|
||||
|
||||
func (g *DeBruijnGraph) HasCycle() bool {
|
||||
@@ -765,3 +1137,59 @@ func (g *DeBruijnGraph) HasCycle() bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// HasCycleInDegree détecte la présence d'un cycle dans le graphe en utilisant la méthode des degrés entrants.
|
||||
// Cette méthode est basée sur le tri topologique : si on ne peut pas trier tous les nœuds,
|
||||
// alors il y a un cycle.
|
||||
//
|
||||
// Returns:
|
||||
// - bool: true si le graphe contient un cycle, false sinon
|
||||
func (g *DeBruijnGraph) HasCycleInDegree() bool {
|
||||
// Créer une map pour stocker les degrés entrants de chaque nœud
|
||||
inDegree := make(map[uint64]int)
|
||||
|
||||
// Initialiser les degrés entrants à 0 pour tous les nœuds
|
||||
for node := range g.graph {
|
||||
inDegree[node] = 0
|
||||
}
|
||||
|
||||
// Calculer les degrés entrants
|
||||
for node := range g.graph {
|
||||
for _, next := range g.Nexts(node) {
|
||||
inDegree[next]++
|
||||
}
|
||||
}
|
||||
|
||||
// Créer une deque pour stocker les nœuds avec un degré entrant de 0
|
||||
queue := deque.Deque[uint64]{}
|
||||
|
||||
// Ajouter tous les nœuds avec un degré entrant de 0 à la deque
|
||||
for node := range g.graph {
|
||||
if inDegree[node] == 0 {
|
||||
queue.PushBack(node)
|
||||
}
|
||||
}
|
||||
|
||||
visited := 0 // Compteur de nœuds visités
|
||||
|
||||
// Parcours BFS
|
||||
for queue.Len() > 0 {
|
||||
// Retirer le premier nœud de la deque
|
||||
node, _ := queue.PopFront()
|
||||
visited++
|
||||
|
||||
// Pour chaque nœud adjacent
|
||||
for _, next := range g.Nexts(node) {
|
||||
// Réduire son degré entrant
|
||||
inDegree[next]--
|
||||
|
||||
// Si le degré entrant devient 0, l'ajouter à la deque
|
||||
if inDegree[next] == 0 {
|
||||
queue.PushBack(next)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// S'il y a un cycle, on n'aura pas pu visiter tous les nœuds
|
||||
return visited != len(g.graph)
|
||||
}
|
||||
|
||||
@@ -2,7 +2,6 @@ package obikmer
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
)
|
||||
|
||||
var __single_base_code__ = []byte{0,
|
||||
@@ -132,39 +131,33 @@ func FastShiftFourMer(index [][]int, shifts *map[int]int, lindex int, seq *obise
|
||||
maxshift := 0
|
||||
maxcount := 0
|
||||
maxscore := -1.0
|
||||
maxrelscore := -1.0
|
||||
|
||||
for shift, count := range *shifts {
|
||||
delete((*shifts), shift)
|
||||
selectscore := float64(count)
|
||||
relativescore := float64(count)
|
||||
over := -shift
|
||||
switch {
|
||||
case shift > 0:
|
||||
over += lindex
|
||||
case shift < 0:
|
||||
over = seq.Len() - over
|
||||
default:
|
||||
over = min(lindex, seq.Len())
|
||||
}
|
||||
relativescore = relativescore / float64(over-3)
|
||||
score := float64(count)
|
||||
if relscore {
|
||||
selectscore = relativescore
|
||||
over := -shift
|
||||
switch {
|
||||
case shift > 0:
|
||||
over += lindex
|
||||
case shift < 0:
|
||||
over = seq.Len() - over
|
||||
default:
|
||||
over = min(lindex, seq.Len())
|
||||
}
|
||||
score = score / float64(over-3)
|
||||
}
|
||||
|
||||
if selectscore > maxscore {
|
||||
if score > maxscore {
|
||||
maxshift = shift
|
||||
maxcount = count
|
||||
maxscore = selectscore
|
||||
maxrelscore = relativescore
|
||||
maxscore = score
|
||||
} else {
|
||||
if selectscore == maxscore && obiutils.Abs(shift) < obiutils.Abs(maxshift) {
|
||||
if score == maxscore && shift < maxshift {
|
||||
maxshift = shift
|
||||
maxcount = count
|
||||
maxrelscore = relativescore
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return maxshift, maxcount, maxrelscore
|
||||
return maxshift, maxcount, maxscore
|
||||
}
|
||||
|
||||
45
pkg/obikmer/oneerror.go
Normal file
45
pkg/obikmer/oneerror.go
Normal file
@@ -0,0 +1,45 @@
|
||||
package obikmer
|
||||
|
||||
import (
|
||||
"iter"
|
||||
"slices"
|
||||
)
|
||||
|
||||
// baseError associates each lower-case nucleotide symbol with the IUPAC
// symbol used to replace it when an error is simulated at a position. The
// table is its own inverse, and 'n' has no entry.
var baseError = map[byte]byte{
	// single nucleotides and their three-base counterparts
	'a': 'b', 'b': 'a',
	'c': 'd', 'd': 'c',
	'g': 'h', 'h': 'g',
	't': 'v', 'v': 't',

	// two-base symbols
	'r': 'y', 'y': 'r',
	's': 'w', 'w': 's',
	'k': 'm', 'm': 'k',
}
|
||||
|
||||
type BytesItem []byte
|
||||
|
||||
func IterateOneError(kmer []byte) iter.Seq[BytesItem] {
|
||||
lkmer := len(kmer)
|
||||
return func(yield func(BytesItem) bool) {
|
||||
for p := 0; p < lkmer; p++ {
|
||||
for p < lkmer && kmer[p] == 'n' {
|
||||
p++
|
||||
}
|
||||
|
||||
if p < lkmer {
|
||||
nkmer := slices.Clone(kmer)
|
||||
nkmer[p] = baseError[kmer[p]]
|
||||
if !yield(nkmer) {
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
// corresponds to the last commit, and not the one when the file will be
|
||||
// committed
|
||||
|
||||
var _Commit = "573acaf"
|
||||
var _Commit = "4774438"
|
||||
var _Version = "Release 4.2.0"
|
||||
|
||||
// Version returns the version of the obitools package.
|
||||
|
||||
@@ -25,7 +25,7 @@ func (s *BioSequence) UnPair() {
|
||||
}
|
||||
|
||||
func (s *BioSequenceSlice) IsPaired() bool {
|
||||
return (*s)[0].paired != nil
|
||||
return s != nil && s.Len() > 0 && (*s)[0].paired != nil
|
||||
}
|
||||
|
||||
func (s *BioSequenceSlice) PairedWith() *BioSequenceSlice {
|
||||
|
||||
@@ -196,16 +196,6 @@ func IsShorterOrEqualTo(length int) SequencePredicate {
|
||||
return f
|
||||
}
|
||||
|
||||
func OccurInAtleast(sample string, n int) SequencePredicate {
|
||||
desc := MakeStatsOnDescription(sample)
|
||||
f := func(sequence *BioSequence) bool {
|
||||
stats := sequence.StatsOn(desc, "NA")
|
||||
return len(stats) >= n
|
||||
}
|
||||
|
||||
return f
|
||||
}
|
||||
|
||||
func IsSequenceMatch(pattern string) SequencePredicate {
|
||||
pat, err := regexp.Compile("(?i)" + pattern)
|
||||
|
||||
|
||||
@@ -31,7 +31,7 @@ func TaxonomyClassifier(taxonomicRank string,
|
||||
if taxon != nil {
|
||||
ttaxon := taxon.TaxonAtRank(taxonomicRank)
|
||||
if abortOnMissing && ttaxon == nil {
|
||||
log.Fatalf("Taxon at rank %s not found in taxonomy for taxid %s", taxonomicRank, taxon.String())
|
||||
log.Fatalf("Taxon at rank %s not found in taxonomy for taxid %d", taxonomicRank, taxon.String())
|
||||
}
|
||||
} else {
|
||||
if abortOnMissing {
|
||||
|
||||
@@ -25,7 +25,7 @@ func IsAValidTaxon(taxonomy *obitax.Taxonomy, withAutoCorrection ...bool) Sequen
|
||||
if autocorrection {
|
||||
sequence.SetTaxid(ttaxid)
|
||||
log.Printf(
|
||||
"Sequence %s : Taxid %s updated with %s",
|
||||
"Sequence %s : Taxid %d updated with %d",
|
||||
sequence.Id(),
|
||||
taxid,
|
||||
ttaxid,
|
||||
|
||||
@@ -1,126 +0,0 @@
|
||||
package obiclean
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sort"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
func commonPrefix(a, b *obiseq.BioSequence) int {
|
||||
i := 0
|
||||
l := min(a.Len(), b.Len())
|
||||
|
||||
if l == 0 {
|
||||
return 0
|
||||
}
|
||||
as := a.Sequence()
|
||||
bs := b.Sequence()
|
||||
|
||||
for i < l && as[i] == bs[i] {
|
||||
i++
|
||||
}
|
||||
|
||||
if obiutils.UnsafeString(as[:i]) != obiutils.UnsafeString(bs[:i]) {
|
||||
log.Fatalf("i: %d, j: %d (%s/%s)", i, i, as[:i], bs[:i])
|
||||
}
|
||||
|
||||
return i
|
||||
}
|
||||
|
||||
func commonSuffix(a, b *obiseq.BioSequence) int {
|
||||
i := a.Len() - 1
|
||||
j := b.Len() - 1
|
||||
|
||||
if i < 0 || j < 0 {
|
||||
return 0
|
||||
}
|
||||
|
||||
as := a.Sequence()
|
||||
bs := b.Sequence()
|
||||
|
||||
l := 0
|
||||
for i >= 0 && j >= 0 && as[i] == bs[j] {
|
||||
i--
|
||||
j--
|
||||
l++
|
||||
}
|
||||
|
||||
if obiutils.UnsafeString(as[i+1:]) != obiutils.UnsafeString(bs[j+1:]) {
|
||||
log.Fatalf("i: %d, j: %d (%s/%s)", i, j, as[i+1:], bs[j+1:])
|
||||
}
|
||||
// log.Warnf("i: %d, j: %d (%s)", i, j, as[i+1:])
|
||||
|
||||
return l
|
||||
}
|
||||
|
||||
func AnnotateChimera(samples map[string]*[]*seqPCR) {
|
||||
|
||||
w := func(sample string, seqs *[]*seqPCR) {
|
||||
ls := len(*seqs)
|
||||
cp := make([]int, ls)
|
||||
cs := make([]int, ls)
|
||||
|
||||
pcrs := make([]*seqPCR, 0, ls)
|
||||
|
||||
for _, s := range *seqs {
|
||||
if len(s.Edges) == 0 {
|
||||
pcrs = append(pcrs, s)
|
||||
}
|
||||
}
|
||||
|
||||
lp := len(pcrs)
|
||||
|
||||
sort.Slice(pcrs, func(i, j int) bool {
|
||||
return pcrs[i].Weight < pcrs[j].Weight
|
||||
})
|
||||
|
||||
for i, s := range pcrs {
|
||||
for j := i + 1; j < lp; j++ {
|
||||
s2 := pcrs[j]
|
||||
cp[j] = commonPrefix(s.Sequence, s2.Sequence)
|
||||
cs[j] = commonSuffix(s.Sequence, s2.Sequence)
|
||||
}
|
||||
|
||||
var cm map[string]string
|
||||
var err error
|
||||
|
||||
chimera, ok := s.Sequence.GetAttribute("chimera")
|
||||
|
||||
if !ok {
|
||||
cm = map[string]string{}
|
||||
} else {
|
||||
cm, err = obiutils.InterfaceToStringMap(chimera)
|
||||
if err != nil {
|
||||
log.Fatalf("type of chimera not map[string]string: %T (%v)",
|
||||
chimera, err)
|
||||
}
|
||||
}
|
||||
|
||||
ls := s.Sequence.Len()
|
||||
|
||||
for k := i + 1; k < lp; k++ {
|
||||
for l := i + 1; l < lp; l++ {
|
||||
if k != l && cp[k]+cs[l] == ls {
|
||||
cm[sample] = fmt.Sprintf("{%s}/{%s}@(%d)",
|
||||
pcrs[k].Sequence.Id(),
|
||||
pcrs[l].Sequence.Id(),
|
||||
cp[k])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(cm) > 0 {
|
||||
s.Sequence.SetAttribute("chimera", cm)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
for sn, sqs := range samples {
|
||||
w(sn, sqs)
|
||||
}
|
||||
|
||||
}
|
||||
@@ -13,24 +13,23 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obialign"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiutils"
|
||||
"github.com/schollz/progressbar/v3"
|
||||
)
|
||||
|
||||
// Ratio describes one edge of distance 1 between a father sequence and one
// of its variants, as exported by EmpiricalDistCsv. The field order matters:
// values of this type are built with positional literals.
type Ratio struct {
	Sample string // name of the sample
	SeqID  string // identifier of the father sequence
	status string // status of the father sequence in the sample
	From   int    // weight of the father sequence
	To     int    // weight of the variant sequence
	CFrom  int    // count of the father sequence
	CTo    int    // count of the variant sequence
	Pos    int    // position of the difference
	Length int    // length of the father sequence
	A      int    // number of 'a' in the father sequence
	C      int    // number of 'c' in the father sequence
	G      int    // number of 'g' in the father sequence
	T      int    // number of 't' in the father sequence
}
|
||||
|
||||
type Edge struct {
|
||||
@@ -53,21 +52,45 @@ func makeEdge(father, dist, pos int, from, to byte) Edge {
|
||||
}
|
||||
}
|
||||
|
||||
// abs returns the absolute value of x.
func abs(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}
|
||||
|
||||
// max returns the larger of x and y.
func max(x, y int) int {
	if y >= x {
		return y
	}
	return x
}
|
||||
|
||||
// min returns the smaller of x and y.
func min(x, y int) int {
	if y <= x {
		return y
	}
	return x
}
|
||||
|
||||
// minMax returns x and y ordered: the smaller value first, the larger second.
func minMax(x, y int) (int, int) {
	if y <= x {
		return y, x
	}
	return x, y
}
|
||||
|
||||
// It takes a filename and a 2D slice of floats produced during graph building,
|
||||
// and writes a CSV file with the first column being the
|
||||
// first nucleotide, the second column being the second nucleotide, and the third column being the
|
||||
// ratio
|
||||
func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
||||
func EmpiricalDistCsv(filename string, data [][]Ratio) {
|
||||
file, err := os.Create(filename)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
}
|
||||
|
||||
destfile, err := obiutils.CompressStream(file, true, true)
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
}
|
||||
defer destfile.Close()
|
||||
defer file.Close()
|
||||
|
||||
pbopt := make([]progressbar.Option, 0, 5)
|
||||
pbopt = append(pbopt,
|
||||
@@ -80,19 +103,19 @@ func EmpiricalDistCsv(filename string, data [][]Ratio, compressed bool) {
|
||||
|
||||
bar := progressbar.NewOptions(len(data), pbopt...)
|
||||
|
||||
fmt.Fprintln(destfile, "Sample,Origin_id,Origin_status,Origin,Mutant,Origin_Weight,Mutant_Weight,Origin_Count,Mutant_Count,Position,Origin_length,A,C,G,T")
|
||||
fmt.Fprintln(file, "Sample,Father_id,Father_status,From,To,Weight_from,Weight_to,Count_from,Count_to,Position,length,A,C,G,T")
|
||||
for code, dist := range data {
|
||||
a1, a2 := intToNucPair(code)
|
||||
for _, ratio := range dist {
|
||||
fmt.Fprintf(destfile, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
|
||||
fmt.Fprintf(file, "%s,%s,%s,%c,%c,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d\n",
|
||||
ratio.Sample,
|
||||
ratio.SeqID,
|
||||
ratio.OriginalStatus,
|
||||
ratio.status,
|
||||
a1, a2,
|
||||
ratio.WOriginal,
|
||||
ratio.WMutant,
|
||||
ratio.COriginal,
|
||||
ratio.CMutant,
|
||||
ratio.From,
|
||||
ratio.To,
|
||||
ratio.CFrom,
|
||||
ratio.CTo,
|
||||
ratio.Pos,
|
||||
ratio.Length,
|
||||
ratio.A,
|
||||
@@ -455,20 +478,16 @@ func EstimateRatio(samples map[string]*[]*seqPCR, minStatRatio int) [][]Ratio {
|
||||
if father.Weight >= minStatRatio && edge.Dist == 1 {
|
||||
s := father.Sequence.Sequence()
|
||||
ratio[edge.NucPair] = append(ratio[edge.NucPair],
|
||||
Ratio{
|
||||
Sample: name,
|
||||
SeqID: father.Sequence.Id(),
|
||||
OriginalStatus: Status(father.Sequence)[name],
|
||||
WOriginal: father.Weight,
|
||||
WMutant: seq.Weight,
|
||||
COriginal: father.Count,
|
||||
CMutant: seq.Count,
|
||||
Pos: edge.Pos,
|
||||
Length: father.Sequence.Len(),
|
||||
A: bytes.Count(s, []byte("a")),
|
||||
C: bytes.Count(s, []byte("c")),
|
||||
G: bytes.Count(s, []byte("g")),
|
||||
T: bytes.Count(s, []byte("t"))})
|
||||
Ratio{name,
|
||||
father.Sequence.Id(), Status(father.Sequence)[name],
|
||||
father.Weight, seq.Weight,
|
||||
father.Count, seq.Count,
|
||||
edge.Pos,
|
||||
father.Sequence.Len(),
|
||||
bytes.Count(s, []byte("a")),
|
||||
bytes.Count(s, []byte("c")),
|
||||
bytes.Count(s, []byte("g")),
|
||||
bytes.Count(s, []byte("t"))})
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,7 +2,6 @@ package obiclean
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"maps"
|
||||
"os"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
@@ -20,7 +19,6 @@ type seqPCR struct {
|
||||
Sequence *obiseq.BioSequence // pointer to the corresponding sequence
|
||||
SonCount int
|
||||
AddedSons int
|
||||
IsHead bool
|
||||
Edges []Edge
|
||||
Cluster map[int]bool // used as the set of head sequences associated to that sequence
|
||||
}
|
||||
@@ -52,7 +50,6 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
|
||||
Sequence: s,
|
||||
SonCount: 0,
|
||||
AddedSons: 0,
|
||||
IsHead: false,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -60,7 +57,9 @@ func buildSamples(dataset obiseq.BioSequenceSlice,
|
||||
return samples
|
||||
}
|
||||
|
||||
func annotateOBIClean(source string, dataset obiseq.BioSequenceSlice) obiiter.IBioSequence {
|
||||
func annotateOBIClean(source string, dataset obiseq.BioSequenceSlice,
|
||||
sample map[string]*([]*seqPCR),
|
||||
tag, NAValue string) obiiter.IBioSequence {
|
||||
batchsize := 1000
|
||||
var annot = func(data obiseq.BioSequenceSlice) (obiseq.BioSequenceSlice, error) {
|
||||
|
||||
@@ -115,28 +114,6 @@ func IsHead(sequence *obiseq.BioSequence) bool {
|
||||
return ishead
|
||||
}
|
||||
|
||||
func NotAlwaysChimera(tag string) obiseq.SequencePredicate {
|
||||
descriptor := obiseq.MakeStatsOnDescription(tag)
|
||||
predicat := func(sequence *obiseq.BioSequence) bool {
|
||||
|
||||
chimera, ok := sequence.GetStringMap("chimera")
|
||||
if !ok || len(chimera) == 0 {
|
||||
return true
|
||||
}
|
||||
samples := maps.Keys(sequence.StatsOn(descriptor, "NA"))
|
||||
|
||||
for s := range samples {
|
||||
if _, ok := chimera[s]; !ok {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
return predicat
|
||||
}
|
||||
|
||||
func HeadCount(sequence *obiseq.BioSequence) int {
|
||||
var err error
|
||||
annotation := sequence.Annotations()
|
||||
@@ -260,7 +237,6 @@ func Mutation(sample map[string]*([]*seqPCR)) {
|
||||
}
|
||||
|
||||
func Status(sequence *obiseq.BioSequence) map[string]string {
|
||||
var err error
|
||||
annotation := sequence.Annotations()
|
||||
iobistatus, ok := annotation["obiclean_status"]
|
||||
var obistatus map[string]string
|
||||
@@ -270,9 +246,9 @@ func Status(sequence *obiseq.BioSequence) map[string]string {
|
||||
case map[string]string:
|
||||
obistatus = iobistatus
|
||||
case map[string]interface{}:
|
||||
obistatus, err = obiutils.InterfaceToStringMap(obistatus)
|
||||
if err != nil {
|
||||
log.Panicf("obiclean_status attribute of sequence %s must be castable to a map[string]string", sequence.Id())
|
||||
obistatus = make(map[string]string)
|
||||
for k, v := range iobistatus {
|
||||
obistatus[k] = fmt.Sprint(v)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
@@ -380,30 +356,19 @@ func CLIOBIClean(itertator obiiter.IBioSequence) obiiter.IBioSequence {
|
||||
}
|
||||
}
|
||||
|
||||
if DetectChimera() {
|
||||
AnnotateChimera(samples)
|
||||
}
|
||||
|
||||
if SaveGraphToFiles() {
|
||||
SaveGMLGraphs(GraphFilesDirectory(), samples, MinCountToEvalMutationRate())
|
||||
}
|
||||
|
||||
if IsSaveRatioTable() {
|
||||
all_ratio := EstimateRatio(samples, MinCountToEvalMutationRate())
|
||||
EmpiricalDistCsv(RatioTableFilename(), all_ratio, obidefault.CompressOutput())
|
||||
EmpiricalDistCsv(RatioTableFilename(), all_ratio)
|
||||
}
|
||||
|
||||
iter := annotateOBIClean(source, db)
|
||||
iter := annotateOBIClean(source, db, samples, SampleAttribute(), "NA")
|
||||
|
||||
if OnlyHead() {
|
||||
iter = iter.FilterOn(IsHead,
|
||||
obidefault.BatchSize()).FilterOn(NotAlwaysChimera(SampleAttribute()),
|
||||
obidefault.BatchSize())
|
||||
}
|
||||
|
||||
if MinSampleCount() > 1 {
|
||||
sc := obiseq.OccurInAtleast(SampleAttribute(), MinSampleCount())
|
||||
iter = iter.FilterOn(sc, obidefault.BatchSize())
|
||||
iter = iter.FilterOn(IsHead, 1000)
|
||||
}
|
||||
|
||||
return iter
|
||||
|
||||
@@ -16,8 +16,6 @@ var _onlyHead = false
|
||||
|
||||
var _saveGraph = "__@@NOSAVE@@__"
|
||||
var _saveRatio = "__@@NOSAVE@@__"
|
||||
var _minSample = 1
|
||||
var _detectChimera = false
|
||||
|
||||
func ObicleanOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_sampleAttribute, "sample", _sampleAttribute,
|
||||
@@ -57,13 +55,6 @@ func ObicleanOptionSet(options *getoptions.GetOpt) {
|
||||
"The ratio file follows the csv format."),
|
||||
)
|
||||
|
||||
options.IntVar(&_minSample, "min-sample-count", _minSample,
|
||||
options.Description("Minimum number of samples a sequence must be present in to be considered in the analysis."),
|
||||
)
|
||||
|
||||
options.BoolVar(&_detectChimera, "detect-chimera", _detectChimera,
|
||||
options.Description("Detect chimera sequences."),
|
||||
)
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
@@ -120,13 +111,3 @@ func IsSaveRatioTable() bool {
|
||||
func RatioTableFilename() string {
|
||||
return _saveRatio
|
||||
}
|
||||
|
||||
// It returns the minimum number of samples a sequence must be present in to be considered in the analysis
|
||||
func MinSampleCount() int {
|
||||
return _minSample
|
||||
}
|
||||
|
||||
// It returns true if chimera detection is enabled
|
||||
func DetectChimera() bool {
|
||||
return _detectChimera
|
||||
}
|
||||
|
||||
@@ -31,6 +31,7 @@ var __output_fastjson_format__ = false
|
||||
var __output_fastobi_format__ = false
|
||||
|
||||
var __no_progress_bar__ = false
|
||||
var __compressed__ = false
|
||||
var __skip_empty__ = false
|
||||
|
||||
var __output_file_name__ = "-"
|
||||
|
||||
@@ -21,7 +21,7 @@ func BuildPairedFileNames(filename string) (string, string) {
|
||||
forward := parts[0] + "_R1"
|
||||
reverse := parts[0] + "_R2"
|
||||
|
||||
if len(parts) > 1 && parts[1] != "" {
|
||||
if parts[1] != "" {
|
||||
suffix := "." + parts[1]
|
||||
forward += suffix
|
||||
reverse += suffix
|
||||
|
||||
@@ -6,7 +6,6 @@ import (
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obidefault"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitax"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
@@ -17,7 +16,6 @@ import (
|
||||
var _BelongTaxa = make([]string, 0)
|
||||
var _NotBelongTaxa = make([]string, 0)
|
||||
var _RequiredRanks = make([]string, 0)
|
||||
var _ValidateTaxonomy = false
|
||||
|
||||
var _MinimumLength = 1
|
||||
var _MaximumLength = int(2e9)
|
||||
@@ -64,9 +62,6 @@ func TaxonomySelectionOptionSet(options *getoptions.GetOpt) {
|
||||
options.ArgName("RANK_NAME"),
|
||||
options.Description("Select sequences belonging a taxon with a rank <RANK_NAME>"))
|
||||
|
||||
options.BoolVar(&_ValidateTaxonomy, "valid-taxid", _ValidateTaxonomy,
|
||||
options.Description("Validate the taxonomic classification of the sequences."))
|
||||
|
||||
}
|
||||
|
||||
func SequenceSelectionOptionSet(options *getoptions.GetOpt) {
|
||||
@@ -276,27 +271,6 @@ func CLIRestrictTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIIsValidTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
if _ValidateTaxonomy {
|
||||
if !obidefault.HasSelectedTaxonomy() {
|
||||
log.Fatal("Taxonomy not found")
|
||||
}
|
||||
taxonomy := obitax.DefaultTaxonomy()
|
||||
if taxonomy == nil {
|
||||
log.Fatal("Taxonomy not found")
|
||||
}
|
||||
|
||||
predicat := func(sequences *obiseq.BioSequence) bool {
|
||||
taxon := sequences.Taxon(taxonomy)
|
||||
return taxon != nil
|
||||
}
|
||||
|
||||
return predicat
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func CLIAvoidTaxonomyPredicate() obiseq.SequencePredicate {
|
||||
var p obiseq.SequencePredicate
|
||||
var p2 obiseq.SequencePredicate
|
||||
@@ -345,7 +319,7 @@ func CLIHasRankDefinedPredicate() obiseq.SequencePredicate {
|
||||
}
|
||||
|
||||
func CLITaxonomyFilterPredicate() obiseq.SequencePredicate {
|
||||
return CLIIsValidTaxonomyPredicate().And(CLIAvoidTaxonomyPredicate()).And(CLIHasRankDefinedPredicate()).And(CLIRestrictTaxonomyPredicate())
|
||||
return CLIHasRankDefinedPredicate().And(CLIRestrictTaxonomyPredicate()).And(CLIAvoidTaxonomyPredicate())
|
||||
}
|
||||
|
||||
func CLIPredicatesPredicate() obiseq.SequencePredicate {
|
||||
|
||||
520
pkg/obitools/obimicroasm/microasm.go
Normal file
520
pkg/obitools/obimicroasm/microasm.go
Normal file
@@ -0,0 +1,520 @@
|
||||
package obimicroasm
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path"
|
||||
"slices"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiformats"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obikmer"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiseq"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obisuffix"
|
||||
)
|
||||
|
||||
func BuildFilterOnPatternReadPairWorker(
|
||||
forward, reverse string,
|
||||
errormax int,
|
||||
cutReads bool,
|
||||
) obiseq.SeqWorker {
|
||||
forwardPatternDir, err := obiapat.MakeApatPattern(forward, errormax, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot compile forward primer %s : %v", forward, err)
|
||||
}
|
||||
|
||||
reverse_rev := obiseq.NewBioSequence("fp", []byte(reverse), "").ReverseComplement(true).String()
|
||||
reveresePatternRev, err := obiapat.MakeApatPattern(reverse_rev, errormax, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot compile reverse complement reverse primer %s : %v", reverse, err)
|
||||
}
|
||||
|
||||
matchRead := func(sequence *obiseq.BioSequence) *obiseq.BioSequence {
|
||||
var aseq obiapat.ApatSequence
|
||||
var err error
|
||||
var read, match *obiseq.BioSequence
|
||||
|
||||
aseq, err = obiapat.MakeApatSequence(sequence, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot prepare apat sequence from %s : %v", sequence.Id(), err)
|
||||
}
|
||||
|
||||
start, end, nerr, matched := forwardPatternDir.BestMatch(aseq, 0, aseq.Len())
|
||||
|
||||
if matched {
|
||||
read = sequence
|
||||
|
||||
if cutReads {
|
||||
read, err = sequence.Subsequence(start, sequence.Len(), false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot cut, on forward, forward read %s [%d,%d] : %v",
|
||||
sequence.Id(), start, sequence.Len(), err)
|
||||
}
|
||||
}
|
||||
|
||||
read.SetAttribute("forward_primer", forward)
|
||||
match, _ = sequence.Subsequence(start, end, false)
|
||||
read.SetAttribute("forward_match", match.String())
|
||||
read.SetAttribute("forward_error", nerr)
|
||||
|
||||
aseq, err = obiapat.MakeApatSequence(read, false, aseq)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot prepare apat sequence from %s : %v", sequence.Id(), err)
|
||||
}
|
||||
|
||||
start, end, nerr, matched = reveresePatternRev.BestMatch(aseq, 0, aseq.Len())
|
||||
|
||||
if matched {
|
||||
|
||||
frread := read
|
||||
|
||||
if cutReads {
|
||||
frread, err = read.Subsequence(0, end, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot xxx cut, on reverse, forward read %s [%d,%d] : %v",
|
||||
sequence.Id(), start, read.Len(), err)
|
||||
}
|
||||
}
|
||||
|
||||
frread.SetAttribute("reverse_primer", reverse)
|
||||
match, _ = read.Subsequence(start, end, false)
|
||||
frread.SetAttribute("reverse_match", match.ReverseComplement(true).String())
|
||||
frread.SetAttribute("reverse_error", nerr)
|
||||
|
||||
read = frread
|
||||
// log.Warnf("Forward-Reverse primer matched on %s : %d\n%s", read.Id(), read.Len(),
|
||||
// obiformats.FormatFasta(read, obiformats.FormatFastSeqJsonHeader))
|
||||
}
|
||||
|
||||
} else {
|
||||
start, end, nerr, matched = reveresePatternRev.BestMatch(aseq, 0, aseq.Len())
|
||||
|
||||
if matched {
|
||||
read = sequence
|
||||
if cutReads {
|
||||
read, err = sequence.Subsequence(0, end, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot yyy cut, on reverse, forward read %s [%d,%d] : %v",
|
||||
sequence.Id(), 0, end, err)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
read.SetAttribute("reverse_primer", reverse)
|
||||
match, _ = read.Subsequence(start, end, false)
|
||||
read.SetAttribute("reverse_match", match.ReverseComplement(true).String())
|
||||
read.SetAttribute("reverse_error", nerr)
|
||||
} else {
|
||||
read = nil
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
return read
|
||||
}
|
||||
|
||||
w := func(sequence *obiseq.BioSequence) (result obiseq.BioSequenceSlice, err error) {
|
||||
result = obiseq.MakeBioSequenceSlice()
|
||||
|
||||
paired := sequence.PairedWith()
|
||||
sequence.UnPair()
|
||||
|
||||
read := matchRead(sequence)
|
||||
|
||||
if read == nil {
|
||||
sequence = sequence.ReverseComplement(true)
|
||||
read = matchRead(sequence)
|
||||
}
|
||||
|
||||
if read != nil {
|
||||
result = append(result, read)
|
||||
}
|
||||
|
||||
if paired != nil {
|
||||
read = matchRead(paired)
|
||||
|
||||
if read == nil {
|
||||
read = matchRead(paired.ReverseComplement(true))
|
||||
}
|
||||
|
||||
if read != nil {
|
||||
result = append(result, read)
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
return w
|
||||
}
|
||||
|
||||
func ExtractOnPatterns(iter obiiter.IBioSequence,
|
||||
forward, reverse string,
|
||||
errormax int,
|
||||
cutReads bool,
|
||||
) obiseq.BioSequenceSlice {
|
||||
|
||||
matched := iter.MakeIWorker(
|
||||
BuildFilterOnPatternReadPairWorker(forward, reverse, errormax, cutReads),
|
||||
false,
|
||||
)
|
||||
|
||||
rep := obiseq.MakeBioSequenceSlice()
|
||||
|
||||
for matched.Next() {
|
||||
frgs := matched.Get()
|
||||
rep = append(rep, frgs.Slice()...)
|
||||
}
|
||||
|
||||
return rep
|
||||
}
|
||||
|
||||
func BuildPCRProduct(seqs obiseq.BioSequenceSlice,
|
||||
consensus_id string,
|
||||
kmer_size int,
|
||||
forward, reverse string,
|
||||
backtrack bool,
|
||||
save_graph bool, dirname string) (*obiseq.BioSequence, error) {
|
||||
|
||||
from := obiseq.NewBioSequence("forward", []byte(forward), "")
|
||||
to := obiseq.NewBioSequence("reverse", []byte(CLIReversePrimer()), "").ReverseComplement(true)
|
||||
|
||||
if backtrack {
|
||||
from, to = to, from
|
||||
}
|
||||
|
||||
if seqs.Len() == 0 {
|
||||
return nil, fmt.Errorf("no sequence provided")
|
||||
}
|
||||
|
||||
if save_graph {
|
||||
if dirname == "" {
|
||||
dirname = "."
|
||||
}
|
||||
|
||||
if stat, err := os.Stat(dirname); err != nil || !stat.IsDir() {
|
||||
// path does not exist or is not directory
|
||||
os.RemoveAll(dirname)
|
||||
err := os.Mkdir(dirname, 0755)
|
||||
|
||||
if err != nil {
|
||||
log.Panicf("Cannot create directory %s for saving graphs", dirname)
|
||||
}
|
||||
}
|
||||
|
||||
fasta, err := os.Create(path.Join(dirname, fmt.Sprintf("%s_consensus.fasta", consensus_id)))
|
||||
|
||||
if err == nil {
|
||||
defer fasta.Close()
|
||||
fasta.Write(obiformats.FormatFastaBatch(obiiter.MakeBioSequenceBatch(
|
||||
fmt.Sprintf("%s_consensus", consensus_id),
|
||||
0,
|
||||
seqs,
|
||||
),
|
||||
obiformats.FormatFastSeqJsonHeader, false).Bytes())
|
||||
fasta.Close()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
log.Debugf("Number of reads : %d\n", len(seqs))
|
||||
|
||||
if kmer_size < 0 {
|
||||
longest := make([]int, len(seqs))
|
||||
|
||||
for i, seq := range seqs {
|
||||
s := obiseq.BioSequenceSlice{seq}
|
||||
sa := obisuffix.BuildSuffixArray(&s)
|
||||
longest[i] = slices.Max(sa.CommonSuffix())
|
||||
}
|
||||
|
||||
// spectrum := map[int]int{}
|
||||
// for _, s := range longest {
|
||||
// spectrum[s]++
|
||||
// }
|
||||
|
||||
// log.Warnf("spectum kmer size : %v", spectrum)
|
||||
|
||||
kmer_size = slices.Max(longest) + 1
|
||||
log.Infof("estimated kmer size : %d", kmer_size)
|
||||
}
|
||||
|
||||
var graph *obikmer.DeBruijnGraph
|
||||
|
||||
var hp []uint64
|
||||
var err error
|
||||
var starts []uint64
|
||||
var stops []uint64
|
||||
|
||||
for {
|
||||
graph = obikmer.MakeDeBruijnGraph(kmer_size)
|
||||
|
||||
for _, s := range seqs {
|
||||
graph.Push(s)
|
||||
}
|
||||
|
||||
if !backtrack {
|
||||
starts = graph.Search(from, CLIAllowedMismatch())
|
||||
stops = graph.BackSearch(to, CLIAllowedMismatch())
|
||||
} else {
|
||||
starts = graph.BackSearch(from, CLIAllowedMismatch())
|
||||
stops = graph.Search(to, CLIAllowedMismatch())
|
||||
}
|
||||
|
||||
log.Infof("Found %d starts", len(starts))
|
||||
pweight := map[int]int{}
|
||||
for _, s := range starts {
|
||||
w := graph.Weight(s)
|
||||
pweight[w]++
|
||||
log.Warnf("Starts : %s (%d)\n", graph.DecodeNode(s), w)
|
||||
}
|
||||
|
||||
log.Infof("Found %d stops", len(stops))
|
||||
for _, s := range stops {
|
||||
w := graph.Weight(s)
|
||||
pweight[w]++
|
||||
log.Warnf("Stop : %s (%d)\n", graph.DecodeNode(s), w)
|
||||
}
|
||||
|
||||
log.Infof("Weight spectrum : %v", pweight)
|
||||
|
||||
wmax := 0
|
||||
sw := 0
|
||||
for w := range pweight {
|
||||
sw += w
|
||||
if w > wmax {
|
||||
wmax = w
|
||||
}
|
||||
}
|
||||
|
||||
graph.FilterMinWeight(int(sw / len(pweight)))
|
||||
graph.FilterMaxWeight(int(wmax * 2))
|
||||
|
||||
log.Infof("Minimum coverage : %d", int(sw/len(pweight)))
|
||||
log.Infof("Maximum coverage : %d", int(wmax*2))
|
||||
|
||||
if !graph.HasCycleInDegree() {
|
||||
break
|
||||
}
|
||||
|
||||
kmer_size++
|
||||
|
||||
if kmer_size > 31 {
|
||||
break
|
||||
}
|
||||
|
||||
SetKmerSize(kmer_size)
|
||||
log.Warnf("Cycle detected, increasing kmer size to %d\n", kmer_size)
|
||||
}
|
||||
|
||||
if !backtrack {
|
||||
starts = graph.Search(from, CLIAllowedMismatch())
|
||||
stops = graph.BackSearch(to, CLIAllowedMismatch())
|
||||
} else {
|
||||
starts = graph.BackSearch(from, CLIAllowedMismatch())
|
||||
stops = graph.Search(to, CLIAllowedMismatch())
|
||||
}
|
||||
|
||||
hp, err = graph.HaviestPath(starts, stops, backtrack)
|
||||
|
||||
log.Debugf("Graph size : %d\n", graph.Len())
|
||||
|
||||
maxw := graph.MaxWeight()
|
||||
modew := graph.WeightMode()
|
||||
meanw := graph.WeightMean()
|
||||
specw := graph.WeightSpectrum()
|
||||
kmer := graph.KmerSize()
|
||||
|
||||
log.Warnf("Weigh mode: %d Weigth mean : %4.1f Weigth max : %d, kmer = %d", modew, meanw, maxw, kmer)
|
||||
log.Warn(specw)
|
||||
|
||||
if save_graph {
|
||||
|
||||
file, err := os.Create(path.Join(dirname,
|
||||
fmt.Sprintf("%s_consensus.gml", consensus_id)))
|
||||
|
||||
if err != nil {
|
||||
fmt.Println(err)
|
||||
} else {
|
||||
file.WriteString(graph.Gml())
|
||||
file.Close()
|
||||
}
|
||||
}
|
||||
|
||||
if err == nil {
|
||||
s := graph.DecodePath(hp)
|
||||
|
||||
seq := obiseq.NewBioSequence(consensus_id, []byte(s), "")
|
||||
|
||||
total_kmer := graph.Len()
|
||||
sumCount := 0
|
||||
|
||||
if seq != nil {
|
||||
for _, s := range seqs {
|
||||
sumCount += s.Count()
|
||||
}
|
||||
seq.SetAttribute("obiconsensus_consensus", true)
|
||||
seq.SetAttribute("obiconsensus_weight", sumCount)
|
||||
seq.SetAttribute("obiconsensus_seq_length", seq.Len())
|
||||
seq.SetAttribute("obiconsensus_kmer_size", kmer_size)
|
||||
seq.SetAttribute("obiconsensus_kmer_max_occur", graph.MaxWeight())
|
||||
seq.SetAttribute("obiconsensus_filtered_graph_size", graph.Len())
|
||||
seq.SetAttribute("obiconsensus_full_graph_size", total_kmer)
|
||||
}
|
||||
|
||||
log.Warnf("Consensus sequence : \n%s", obiformats.FormatFasta(seq, obiformats.FormatFastSeqJsonHeader))
|
||||
|
||||
return seq, nil
|
||||
|
||||
}
|
||||
|
||||
return nil, err
|
||||
}
|
||||
|
||||
func CLIAssemblePCR() *obiseq.BioSequence {
|
||||
|
||||
pairs, err := CLIPairedSequence()
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Cannot open file (%v)", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
matched := ExtractOnPatterns(pairs,
|
||||
CLIForwardPrimer(),
|
||||
CLIReversePrimer(),
|
||||
CLIAllowedMismatch(),
|
||||
true,
|
||||
)
|
||||
|
||||
seq, err := BuildPCRProduct(
|
||||
matched,
|
||||
CLIGraphFilesDirectory(),
|
||||
CLIKmerSize(),
|
||||
CLIForwardPrimer(),
|
||||
CLIReversePrimer(),
|
||||
false,
|
||||
CLISaveGraphToFiles(),
|
||||
CLIGraphFilesDirectory())
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot build the consensus sequence : %v", err)
|
||||
|
||||
}
|
||||
|
||||
forwardPatternDir, err := obiapat.MakeApatPattern(
|
||||
CLIForwardPrimer(),
|
||||
CLIAllowedMismatch(),
|
||||
false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot compile forward primer %s : %v", CLIForwardPrimer(), err)
|
||||
}
|
||||
|
||||
reverse_rev := obiseq.NewBioSequence("fp", []byte(CLIReversePrimer()), "").ReverseComplement(true).String()
|
||||
reveresePatternRev, err := obiapat.MakeApatPattern(reverse_rev, CLIAllowedMismatch(), false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot compile reverse complement reverse primer %s : %v", CLIReversePrimer(), err)
|
||||
}
|
||||
|
||||
aseq, err := obiapat.MakeApatSequence(seq, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot build apat sequence: %v", err)
|
||||
}
|
||||
|
||||
fstart, fend, fnerr, hasfw := forwardPatternDir.BestMatch(aseq, 0, aseq.Len())
|
||||
rstart, rend, rnerr, hasrev := reveresePatternRev.BestMatch(aseq, 0, aseq.Len())
|
||||
|
||||
for hasfw && !hasrev {
|
||||
var rseq *obiseq.BioSequence
|
||||
rseq, err = BuildPCRProduct(
|
||||
matched,
|
||||
CLIGraphFilesDirectory(),
|
||||
CLIKmerSize(),
|
||||
CLIForwardPrimer(),
|
||||
CLIReversePrimer(),
|
||||
true,
|
||||
CLISaveGraphToFiles(),
|
||||
CLIGraphFilesDirectory())
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot build Reverse PCR sequence: %v", err)
|
||||
}
|
||||
|
||||
kmerSize, _ := seq.GetIntAttribute("obiconsensus_kmer_size")
|
||||
fp, _ := seq.Subsequence(seq.Len()-kmerSize, seq.Len(), false)
|
||||
rp, _ := rseq.Subsequence(0, kmerSize, false)
|
||||
rp = rp.ReverseComplement(true)
|
||||
|
||||
pairs, err := CLIPairedSequence()
|
||||
|
||||
if err != nil {
|
||||
log.Errorf("Cannot open file (%v)", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
nmatched := ExtractOnPatterns(pairs,
|
||||
fp.String(),
|
||||
rp.String(),
|
||||
CLIAllowedMismatch(),
|
||||
true,
|
||||
)
|
||||
|
||||
in := map[string]bool{}
|
||||
|
||||
for _, s := range matched {
|
||||
in[s.String()] = true
|
||||
}
|
||||
|
||||
for _, s := range nmatched {
|
||||
if !in[s.String()] {
|
||||
matched = append(matched, s)
|
||||
}
|
||||
}
|
||||
|
||||
seq, err = BuildPCRProduct(
|
||||
matched,
|
||||
CLIGraphFilesDirectory(),
|
||||
CLIKmerSize(),
|
||||
CLIForwardPrimer(),
|
||||
CLIReversePrimer(),
|
||||
false,
|
||||
CLISaveGraphToFiles(),
|
||||
CLIGraphFilesDirectory())
|
||||
|
||||
aseq, err := obiapat.MakeApatSequence(seq, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("Cannot build apat sequence: %v", err)
|
||||
}
|
||||
fstart, fend, fnerr, hasfw = forwardPatternDir.BestMatch(aseq, 0, aseq.Len())
|
||||
rstart, rend, rnerr, hasrev = reveresePatternRev.BestMatch(aseq, 0, aseq.Len())
|
||||
|
||||
}
|
||||
|
||||
marker, _ := seq.Subsequence(fstart, rend, false)
|
||||
|
||||
marker.SetAttribute("forward_primer", CLIForwardPrimer())
|
||||
match, _ := seq.Subsequence(fstart, fend, false)
|
||||
marker.SetAttribute("forward_match", match.String())
|
||||
marker.SetAttribute("forward_error", fnerr)
|
||||
|
||||
marker.SetAttribute("reverse_primer", CLIReversePrimer())
|
||||
match, _ = seq.Subsequence(rstart, rend, false)
|
||||
marker.SetAttribute("reverse_match", match.ReverseComplement(true).String())
|
||||
marker.SetAttribute("reverse_error", rnerr)
|
||||
|
||||
return marker
|
||||
}
|
||||
139
pkg/obitools/obimicroasm/options.go
Normal file
139
pkg/obitools/obimicroasm/options.go
Normal file
@@ -0,0 +1,139 @@
|
||||
package obimicroasm
|
||||
|
||||
import (
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiapat"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obiiter"
|
||||
"git.metabarcoding.org/obitools/obitools4/obitools4/pkg/obitools/obiconvert"
|
||||
"github.com/DavidGamba/go-getoptions"
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
var _ForwardFile = ""
|
||||
var _ReverseFile = ""
|
||||
var _ForwardPrimer string
|
||||
var _ReversePrimer string
|
||||
var _AllowedMismatch = 0
|
||||
var _kmerSize = -1
|
||||
|
||||
var _saveGraph = "__@@NOSAVE@@__"
|
||||
|
||||
func MicroAsmOptionSet(options *getoptions.GetOpt) {
|
||||
options.StringVar(&_ForwardFile, "forward-reads", "",
|
||||
options.Alias("F"),
|
||||
options.ArgName("FILENAME_F"),
|
||||
options.Required("You must provide at a forward file"),
|
||||
options.Description("The file names containing the forward reads"))
|
||||
options.StringVar(&_ReverseFile, "reverse-reads", "",
|
||||
options.Alias("R"),
|
||||
options.ArgName("FILENAME_R"),
|
||||
options.Required("You must provide a reverse file"),
|
||||
options.Description("The file names containing the reverse reads"))
|
||||
options.StringVar(&_ForwardPrimer, "forward", "",
|
||||
options.Required("You must provide a forward primer"),
|
||||
options.Description("The forward primer used for the electronic PCR."))
|
||||
|
||||
options.StringVar(&_ReversePrimer, "reverse", "",
|
||||
options.Required("You must provide a reverse primer"),
|
||||
options.Description("The reverse primer used for the electronic PCR."))
|
||||
|
||||
options.IntVar(&_AllowedMismatch, "allowed-mismatches", 0,
|
||||
options.Alias("e"),
|
||||
options.Description("Maximum number of mismatches allowed for each primer."))
|
||||
options.IntVar(&_kmerSize, "kmer-size", _kmerSize,
|
||||
options.ArgName("SIZE"),
|
||||
options.Description("The size of the kmer used to build the consensus. "+
|
||||
"Default value = -1, which means that the kmer size is estimated from the data"),
|
||||
)
|
||||
|
||||
options.StringVar(&_saveGraph, "save-graph", _saveGraph,
|
||||
options.Description("Creates a directory containing the set of DAG used by the obiclean clustering algorithm. "+
|
||||
"The graph files follow the graphml format."),
|
||||
)
|
||||
|
||||
}
|
||||
|
||||
func OptionSet(options *getoptions.GetOpt) {
|
||||
obiconvert.OptionSet(options)
|
||||
MicroAsmOptionSet(options)
|
||||
}
|
||||
|
||||
// CLIForwardPrimer returns the sequence of the forward primer as indicated by the
|
||||
// --forward command line option
|
||||
func CLIForwardPrimer() string {
|
||||
pattern, err := obiapat.MakeApatPattern(_ForwardPrimer, _AllowedMismatch, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("%+v", err)
|
||||
}
|
||||
|
||||
pattern.Free()
|
||||
|
||||
return _ForwardPrimer
|
||||
}
|
||||
|
||||
// CLIReversePrimer returns the sequence of the reverse primer as indicated by the
|
||||
// --reverse command line option
|
||||
func CLIReversePrimer() string {
|
||||
pattern, err := obiapat.MakeApatPattern(_ReversePrimer, _AllowedMismatch, false)
|
||||
|
||||
if err != nil {
|
||||
log.Fatalf("%+v", err)
|
||||
}
|
||||
|
||||
pattern.Free()
|
||||
|
||||
return _ReversePrimer
|
||||
}
|
||||
|
||||
// CLIAllowedMismatch returns the allowed mistmatch count between each
|
||||
// primer and the sequences as indicated by the
|
||||
// --allowed-mismatches|-e command line option
|
||||
func CLIAllowedMismatch() int {
|
||||
return _AllowedMismatch
|
||||
}
|
||||
|
||||
func CLIPairedSequence() (obiiter.IBioSequence, error) {
|
||||
forward, err := obiconvert.CLIReadBioSequences(_ForwardFile)
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
reverse, err := obiconvert.CLIReadBioSequences(_ReverseFile)
|
||||
if err != nil {
|
||||
return obiiter.NilIBioSequence, err
|
||||
}
|
||||
|
||||
paired := forward.PairTo(reverse)
|
||||
|
||||
return paired, nil
|
||||
}
|
||||
|
||||
func CLIForwardFile() string {
|
||||
return _ForwardFile
|
||||
}
|
||||
|
||||
// Returns true it the obliclean graphs must be saved
|
||||
func CLISaveGraphToFiles() bool {
|
||||
return _saveGraph != "__@@NOSAVE@@__"
|
||||
}
|
||||
|
||||
// It returns the directory where the graph files are saved
|
||||
func CLIGraphFilesDirectory() string {
|
||||
return _saveGraph
|
||||
}
|
||||
|
||||
// CLIKmerSize returns the value of the kmer size to use for building the consensus.
|
||||
//
|
||||
// The value of the kmer size is set by the user with the `-k` flag.
|
||||
// The value -1 means that the kmer size is estimated as the minimum value that
|
||||
// insure that no kmer are present more than one time in a sequence.
|
||||
//
|
||||
// No parameters.
|
||||
// Returns an integer value.
|
||||
func CLIKmerSize() int {
|
||||
return _kmerSize
|
||||
}
|
||||
|
||||
func SetKmerSize(kmerSize int) {
|
||||
_kmerSize = kmerSize
|
||||
}
|
||||
@@ -129,7 +129,6 @@ func AssemblePESequences(seqA, seqB *obiseq.BioSequence,
|
||||
}
|
||||
lcons := cons.Len()
|
||||
aliLength := lcons - _Abs(left) - _Abs(right)
|
||||
|
||||
identity := float64(match) / float64(aliLength)
|
||||
if aliLength == 0 {
|
||||
identity = 0
|
||||
@@ -238,7 +237,7 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
|
||||
log.Printf("End of the sequence Pairing")
|
||||
}()
|
||||
|
||||
f := func(iterator obiiter.IBioSequence) {
|
||||
f := func(iterator obiiter.IBioSequence, wid int) {
|
||||
arena := obialign.MakePEAlignArena(150, 150)
|
||||
shifts := make(map[int]int)
|
||||
|
||||
@@ -263,9 +262,9 @@ func IAssemblePESequencesBatch(iterator obiiter.IBioSequence,
|
||||
log.Printf("Start of the sequence Pairing using %d workers\n", nworkers)
|
||||
|
||||
for i := 0; i < nworkers-1; i++ {
|
||||
go f(iterator.Split())
|
||||
go f(iterator.Split(), i)
|
||||
}
|
||||
go f(iterator)
|
||||
go f(iterator, nworkers-1)
|
||||
return newIter
|
||||
|
||||
}
|
||||
|
||||
@@ -73,10 +73,6 @@ func FindClosests(sequence *obiseq.BioSequence,
|
||||
refcounts []*obikmer.Table4mer,
|
||||
runExact bool) (obiseq.BioSequenceSlice, int, float64, string, []int) {
|
||||
|
||||
if sequence.Len() < 5 {
|
||||
return obiseq.BioSequenceSlice{}, 1000, 0, "NA", []int{}
|
||||
}
|
||||
|
||||
var matrix []uint64
|
||||
|
||||
seqwords := obikmer.Count4Mer(sequence, nil, nil)
|
||||
@@ -260,7 +256,7 @@ func CLIAssignTaxonomy(iterator obiiter.IBioSequence,
|
||||
if taxon != nil {
|
||||
j++
|
||||
} else {
|
||||
log.Warnf("Taxid %s is not described in the taxonomy %s."+
|
||||
log.Warnf("Taxid %d is not described in the taxonomy %s."+
|
||||
" Sequence %s is discared from the reference database",
|
||||
seq.Taxid(), taxo.Name(), seq.Id())
|
||||
}
|
||||
|
||||
@@ -23,7 +23,7 @@ func MakeSet[E comparable](vals ...E) Set[E] {
|
||||
// It takes a variadic parameter of type E, where E is a comparable type.
|
||||
// It returns a pointer to a Set of type E.
|
||||
func NewSet[E comparable](vals ...E) *Set[E] {
|
||||
s := MakeSet(vals...)
|
||||
s := MakeSet[E](vals...)
|
||||
return &s
|
||||
}
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ func TestNewSet(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test Case 2: Creating a set with multiple values
|
||||
set2 := NewSet("apple", "banana", "cherry")
|
||||
set2 := NewSet[string]("apple", "banana", "cherry")
|
||||
if len(*set2) != 3 {
|
||||
t.Errorf("Expected size to be 3, but got %d", len(*set2))
|
||||
}
|
||||
@@ -147,7 +147,7 @@ func TestMembers(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test case 2: Set with multiple elements
|
||||
set = MakeSet(1, 2, 3)
|
||||
set = MakeSet[int](1, 2, 3)
|
||||
expected = []int{1, 2, 3}
|
||||
actual = set.Members()
|
||||
sort.Ints(actual)
|
||||
@@ -172,7 +172,7 @@ func TestSetString(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test set with single member
|
||||
singleMemberSet := NewSet(42)
|
||||
singleMemberSet := NewSet[int](42)
|
||||
singleMemberSetString := singleMemberSet.String()
|
||||
expectedSingleMemberSetString := "[42]"
|
||||
if singleMemberSetString != expectedSingleMemberSetString {
|
||||
@@ -180,7 +180,7 @@ func TestSetString(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test set with multiple members
|
||||
multipleMembersSet := NewSet(1, 2, 3)
|
||||
multipleMembersSet := NewSet[int](1, 2, 3)
|
||||
multipleMembersSetString := multipleMembersSet.String()
|
||||
expectedMultipleMembersSetString := "[1 2 3]"
|
||||
if multipleMembersSetString != expectedMultipleMembersSetString {
|
||||
@@ -213,26 +213,26 @@ func TestUnion(t *testing.T) {
|
||||
|
||||
// Test case 2: Union of an empty set and a non-empty set should return the non-empty set
|
||||
set1 = MakeSet[int]()
|
||||
set2 = MakeSet(1, 2, 3)
|
||||
expected = MakeSet(1, 2, 3)
|
||||
set2 = MakeSet[int](1, 2, 3)
|
||||
expected = MakeSet[int](1, 2, 3)
|
||||
result = set1.Union(set2)
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
t.Errorf("Expected %v, but got %v", expected, result)
|
||||
}
|
||||
|
||||
// Test case 3: Union of two non-empty sets with common elements should return a set with unique elements
|
||||
set1 = MakeSet(1, 2, 3)
|
||||
set2 = MakeSet(2, 3, 4)
|
||||
expected = MakeSet(1, 2, 3, 4)
|
||||
set1 = MakeSet[int](1, 2, 3)
|
||||
set2 = MakeSet[int](2, 3, 4)
|
||||
expected = MakeSet[int](1, 2, 3, 4)
|
||||
result = set1.Union(set2)
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
t.Errorf("Expected %v, but got %v", expected, result)
|
||||
}
|
||||
|
||||
// Test case 4: Union of two non-empty sets with no common elements should return a set with all elements
|
||||
set1 = MakeSet(1, 2, 3)
|
||||
set2 = MakeSet(4, 5, 6)
|
||||
expected = MakeSet(1, 2, 3, 4, 5, 6)
|
||||
set1 = MakeSet[int](1, 2, 3)
|
||||
set2 = MakeSet[int](4, 5, 6)
|
||||
expected = MakeSet[int](1, 2, 3, 4, 5, 6)
|
||||
result = set1.Union(set2)
|
||||
if !reflect.DeepEqual(result, expected) {
|
||||
t.Errorf("Expected %v, but got %v", expected, result)
|
||||
|
||||
20
pkg/obiutils/unique.go
Normal file
20
pkg/obiutils/unique.go
Normal file
@@ -0,0 +1,20 @@
|
||||
package obiutils
|
||||
|
||||
// Unique returns a new slice containing each distinct value of the input
// slice exactly once.
//
// Values are kept in the order of their first occurrence in the input, so the
// result is deterministic for a given input. The input slice is not modified
// and the result never shares its backing array with it.
//
// Parameters:
//   - slice: The input slice containing potentially duplicate values
//
// Returns:
//   - A new slice containing only unique values; it is empty (but non-nil)
//     when the input is nil or empty
func Unique[T comparable](slice []T) []T {
	// Only key membership matters, so the map carries zero-width values.
	seen := make(map[T]struct{}, len(slice))
	unique := make([]T, 0, len(slice))

	for _, v := range slice {
		if _, dup := seen[v]; dup {
			continue
		}
		seen[v] = struct{}{}
		// Appending on first sight is what preserves the input order.
		unique = append(unique, v)
	}

	return unique
}
|
||||
Reference in New Issue
Block a user