first implementation but far to be optimal
This commit is contained in:
@@ -0,0 +1,243 @@
|
||||
%% This BibTeX bibliography file was created using BibDesk.
|
||||
%% https://bibdesk.sourceforge.io/
|
||||
|
||||
%% Created for Eric Coissac at 2026-04-18 08:19:36 +0200
|
||||
|
||||
|
||||
%% Saved with string encoding Unicode (UTF-8)
|
||||
|
||||
|
||||
|
||||
@article{Zheng2020-ji,
|
||||
abstract = {MOTIVATION: Minimizers are methods to sample k-mers from a
|
||||
string, with the guarantee that similar set of k-mers will be
|
||||
chosen on similar strings. It is parameterized by the k-mer
|
||||
length k, a window length w and an order on the k-mers.
|
||||
Minimizers are used in a large number of softwares and pipelines
|
||||
to improve computation efficiency and decrease memory usage.
|
||||
Despite the method's popularity, many theoretical questions
|
||||
regarding its performance remain open. The core metric for
|
||||
measuring performance of a minimizer is the density, which
|
||||
measures the sparsity of sampled k-mers. The theoretical optimal
|
||||
density for a minimizer is 1/w, provably not achievable in
|
||||
general. For given k and w, little is known about asymptotically
|
||||
optimal minimizers, that is minimizers with density O(1/w).
|
||||
RESULTS: We derive a necessary and sufficient condition for
|
||||
existence of asymptotically optimal minimizers. We also provide a
|
||||
randomized algorithm, called the Miniception, to design
|
||||
minimizers with the best theoretical guarantee to date on density
|
||||
in practical scenarios. Constructing and using the Miniception is
|
||||
as easy as constructing and using a random minimizer, which
|
||||
allows the design of efficient minimizers that scale to the
|
||||
values of k and w used in current bioinformatics software
|
||||
programs. AVAILABILITY AND IMPLEMENTATION: Reference
|
||||
implementation of the Miniception and the codes for analysis can
|
||||
be found at https://github.com/kingsford-group/miniception.
|
||||
SUPPLEMENTARY INFORMATION: Supplementary data are available at
|
||||
Bioinformatics online.},
|
||||
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
|
||||
doi = {10.1093/bioinformatics/btaa472},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = jul,
|
||||
number = {Suppl_1},
|
||||
pages = {i119--i127},
|
||||
pmc = {PMC8248892},
|
||||
pmid = 32657376,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {Improved design and analysis of practical minimizers},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btaa472},
|
||||
volume = 36,
|
||||
year = 2020,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btaa472}}
|
||||
|
||||
@article{Zheng2021-cc,
|
||||
abstract = {MOTIVATION: Minimizers are efficient methods to sample k-mers
|
||||
from genomic sequences that unconditionally preserve sufficiently
|
||||
long matches between sequences. Well-established methods to
|
||||
construct efficient minimizers focus on sampling fewer k-mers on
|
||||
a random sequence and use universal hitting sets (sets of k-mers
|
||||
that appear frequently enough) to upper bound the sketch size. In
|
||||
contrast, the problem of sequence-specific minimizers, which is
|
||||
to construct efficient minimizers to sample fewer k-mers on a
|
||||
specific sequence such as the reference genome, is less studied.
|
||||
Currently, the theoretical understanding of this problem is
|
||||
lacking, and existing methods do not specialize well to sketch
|
||||
specific sequences. RESULTS: We propose the concept of polar
|
||||
sets, complementary to the existing idea of universal hitting
|
||||
sets. Polar sets are k-mer sets that are spread out enough on the
|
||||
reference, and provably specialize well to specific sequences.
|
||||
Link energy measures how well spread out a polar set is, and with
|
||||
it, the sketch size can be bounded from above and below in a
|
||||
theoretically sound way. This allows for direct optimization of
|
||||
sketch size. We propose efficient heuristics to construct polar
|
||||
sets, and via experiments on the human reference genome, show
|
||||
their practical superiority in designing efficient
|
||||
sequence-specific minimizers. AVAILABILITY AND IMPLEMENTATION: A
|
||||
reference implementation and code for analyses under an
|
||||
open-source license are at
|
||||
https://github.com/kingsford-group/polarset. SUPPLEMENTARY
|
||||
INFORMATION: Supplementary data are available at Bioinformatics
|
||||
online.},
|
||||
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
|
||||
doi = {10.1093/bioinformatics/btab313},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = jul,
|
||||
number = {Suppl\_1},
|
||||
pages = {i187--i195},
|
||||
pmc = {PMC8686682},
|
||||
pmid = 34252928,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {Sequence-specific minimizers via polar sets},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btab313},
|
||||
volume = 37,
|
||||
year = 2021,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btab313}}
|
||||
|
||||
@article{Pan2024-hb,
|
||||
abstract = {MOTIVATION: The minimizer concept is a data structure for
|
||||
sequence sketching. The standard canonical minimizer selects a
|
||||
subset of k-mers from the given DNA sequence by comparing the
|
||||
forward and reverse k-mers in a window simultaneously according
|
||||
to a predefined selection scheme. It is widely employed by
|
||||
sequence analysis such as read mapping and assembly. k-mer
|
||||
density, k-mer repetitiveness (e.g. k-mer bias), and
|
||||
computational efficiency are three critical measurements for
|
||||
minimizer selection schemes. However, there exist trade-offs
|
||||
between kinds of minimizer variants. Generic, effective, and
|
||||
efficient are always the requirements for high-performance
|
||||
minimizer algorithms. RESULTS: We propose a simple minimizer
|
||||
operator as a refinement of the standard canonical minimizer. It
|
||||
takes only a few operations to compute. However, it can improve
|
||||
the k-mer repetitiveness, especially for the lexicographic order.
|
||||
It applies to other selection schemes of total orders (e.g.
|
||||
random orders). Moreover, it is computationally efficient and the
|
||||
density is close to that of the standard minimizer. The refined
|
||||
minimizer may benefit high-performance applications like binning
|
||||
and read mapping. AVAILABILITY AND IMPLEMENTATION: The source
|
||||
code of the benchmark in this work is available at the github
|
||||
repository https://github.com/xp3i4/mini\_benchmark.},
|
||||
author = {Pan, Chenxu and Reinert, Knut},
|
||||
doi = {10.1093/bioinformatics/btae045},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = feb,
|
||||
number = 2,
|
||||
pmc = {PMC10868324},
|
||||
pmid = 38269626,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {A simple refined DNA minimizer operator enables 2-fold faster computation},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btae045},
|
||||
volume = 40,
|
||||
year = 2024,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btae045}}
|
||||
|
||||
@article{Kille2023-px,
|
||||
abstract = {MOTIVATION: The Jaccard similarity on k-mer sets has shown to be
|
||||
a convenient proxy for sequence identity. By avoiding expensive
|
||||
base-level alignments and comparing reduced sequence
|
||||
representations, tools such as MashMap can scale to massive
|
||||
numbers of pairwise comparisons while still providing useful
|
||||
similarity estimates. However, due to their reliance on minimizer
|
||||
winnowing, previous versions of MashMap were shown to be biased
|
||||
and inconsistent estimators of Jaccard similarity. This directly
|
||||
impacts downstream tools that rely on the accuracy of these
|
||||
estimates. RESULTS: To address this, we propose the minmer
|
||||
winnowing scheme, which generalizes the minimizer scheme by use
|
||||
of a rolling minhash with multiple sampled k-mers per window. We
|
||||
show both theoretically and empirically that minmers yield an
|
||||
unbiased estimator of local Jaccard similarity, and we implement
|
||||
this scheme in an updated version of MashMap. The minmer-based
|
||||
implementation is over 10 times faster than the minimizer-based
|
||||
version under the default ANI threshold, making it well-suited
|
||||
for large-scale comparative genomics applications. AVAILABILITY
|
||||
AND IMPLEMENTATION: MashMap3 is available at
|
||||
https://github.com/marbl/MashMap.},
|
||||
author = {Kille, Bryce and Garrison, Erik and Treangen, Todd J and Phillippy, Adam M},
|
||||
doi = {10.1093/bioinformatics/btad512},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = sep,
|
||||
number = 9,
|
||||
pmc = {PMC10505501},
|
||||
pmid = 37603771,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {Minmers are a generalization of minimizers that enable unbiased local Jaccard estimation},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btad512},
|
||||
volume = 39,
|
||||
year = 2023,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btad512}}
|
||||
|
||||
@incollection{Golan2025-xf,
|
||||
address = {Cham},
|
||||
author = {Golan, Shay and Shur, Arseny M},
|
||||
booktitle = {Lecture Notes in Computer Science},
|
||||
doi = {10.1007/978-3-031-82670-2\_25},
|
||||
isbn = {9783031826696,9783031826702},
|
||||
issn = {0302-9743,1611-3349},
|
||||
language = {en},
|
||||
pages = {347--360},
|
||||
publisher = {Springer Nature Switzerland},
|
||||
series = {Lecture Notes in Computer Science},
|
||||
title = {Expected density of random minimizers},
|
||||
url = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
|
||||
year = 2025,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
|
||||
bdsk-url-2 = {http://dx.doi.org/10.1007/978-3-031-82670-2%5C_25}}
|
||||
|
||||
@article{Mohamadi2017-ok,
|
||||
abstract = {Motivation: Many bioinformatics algorithms are designed for the
|
||||
analysis of sequences of some uniform length, conventionally
|
||||
referred to as k -mers. These include de Bruijn graph assembly
|
||||
methods and sequence alignment tools. An efficient algorithm to
|
||||
enumerate the number of unique k -mers, or even better, to build
|
||||
a histogram of k -mer frequencies would be desirable for these
|
||||
tools and their downstream analysis pipelines. Among other
|
||||
applications, estimated frequencies can be used to predict genome
|
||||
sizes, measure sequencing error rates, and tune runtime
|
||||
parameters for analysis tools. However, calculating a k -mer
|
||||
histogram from large volumes of sequencing data is a challenging
|
||||
task. Results: Here, we present ntCard, a streaming algorithm for
|
||||
estimating the frequencies of k -mers in genomics datasets. At
|
||||
its core, ntCard uses the ntHash algorithm to efficiently compute
|
||||
hash values for streamed sequences. It then samples the
|
||||
calculated hash values to build a reduced representation
|
||||
multiplicity table describing the sample distribution. Finally,
|
||||
it uses a statistical model to reconstruct the population
|
||||
distribution from the sample distribution. We have compared the
|
||||
performance of ntCard and other cardinality estimation
|
||||
algorithms. We used three datasets of 480 GB, 500 GB and 2.4 TB
|
||||
in size, where the first two representing whole genome shotgun
|
||||
sequencing experiments on the human genome and the last one on
|
||||
the white spruce genome. Results show ntCard estimates k -mer
|
||||
coverage frequencies >15× faster than the state-of-the-art
|
||||
algorithms, using similar amount of memory, and with higher
|
||||
accuracy rates. Thus, our benchmarks demonstrate ntCard as a
|
||||
potentially enabling technology for large-scale genomics
|
||||
applications. Availability and Implementation: ntCard is written
|
||||
in C ++ and is released under the GPL license. It is freely
|
||||
available at https://github.com/bcgsc/ntCard. Contact:
|
||||
hmohamadi@bcgsc.ca or ibirol@bcgsc.ca. Supplementary information:
|
||||
Supplementary data are available at Bioinformatics online.},
|
||||
author = {Mohamadi, Hamid and Khan, Hamza and Birol, Inanc},
|
||||
date-modified = {2026-04-18 08:19:36 +0200},
|
||||
doi = {10.1093/bioinformatics/btw832},
|
||||
issn = {1367-4803,1367-4811},
|
||||
journal = {Bioinformatics (Oxford, England)},
|
||||
language = {en},
|
||||
month = may,
|
||||
number = 9,
|
||||
pages = {1324--1330},
|
||||
pmc = {PMC5408799},
|
||||
pmid = 28453674,
|
||||
publisher = {Oxford University Press (OUP)},
|
||||
title = {ntCard: a streaming algorithm for cardinality estimation in genomics data},
|
||||
url = {http://dx.doi.org/10.1093/bioinformatics/btw832},
|
||||
volume = 33,
|
||||
year = 2017,
|
||||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btw832}}
|
||||
Reference in New Issue
Block a user