244 lines
13 KiB
BibTeX
244 lines
13 KiB
BibTeX
%% This BibTeX bibliography file was created using BibDesk.
|
||
%% https://bibdesk.sourceforge.io/
|
||
|
||
%% Created for Eric Coissac at 2026-04-18 08:19:36 +0200
|
||
|
||
|
||
%% Saved with string encoding Unicode (UTF-8)
|
||
|
||
|
||
|
||
@article{Zheng2020-ji,
|
||
abstract = {MOTIVATION: Minimizers are methods to sample k-mers from a
|
||
string, with the guarantee that similar set of k-mers will be
|
||
chosen on similar strings. It is parameterized by the k-mer
|
||
length k, a window length w and an order on the k-mers.
|
||
Minimizers are used in a large number of softwares and pipelines
|
||
to improve computation efficiency and decrease memory usage.
|
||
Despite the method's popularity, many theoretical questions
|
||
regarding its performance remain open. The core metric for
|
||
measuring performance of a minimizer is the density, which
|
||
measures the sparsity of sampled k-mers. The theoretical optimal
|
||
density for a minimizer is 1/w, provably not achievable in
|
||
general. For given k and w, little is known about asymptotically
|
||
optimal minimizers, that is minimizers with density O(1/w).
|
||
RESULTS: We derive a necessary and sufficient condition for
|
||
existence of asymptotically optimal minimizers. We also provide a
|
||
randomized algorithm, called the Miniception, to design
|
||
minimizers with the best theoretical guarantee to date on density
|
||
in practical scenarios. Constructing and using the Miniception is
|
||
as easy as constructing and using a random minimizer, which
|
||
allows the design of efficient minimizers that scale to the
|
||
values of k and w used in current bioinformatics software
|
||
programs. AVAILABILITY AND IMPLEMENTATION: Reference
|
||
implementation of the Miniception and the codes for analysis can
|
||
be found at https://github.com/kingsford-group/miniception.
|
||
SUPPLEMENTARY INFORMATION: Supplementary data are available at
|
||
Bioinformatics online.},
|
||
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
|
||
doi = {10.1093/bioinformatics/btaa472},
|
||
issn = {1367-4803,1367-4811},
|
||
journal = {Bioinformatics (Oxford, England)},
|
||
language = {en},
|
||
month = jul,
|
||
number = {Suppl_1},
|
||
pages = {i119--i127},
|
||
pmc = {PMC8248892},
|
||
pmid = 32657376,
|
||
publisher = {Oxford University Press (OUP)},
|
||
title = {Improved design and analysis of practical minimizers},
|
||
url = {http://dx.doi.org/10.1093/bioinformatics/btaa472},
|
||
volume = 36,
|
||
year = 2020,
|
||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btaa472}}
|
||
|
||
@article{Zheng2021-cc,
|
||
abstract = {MOTIVATION: Minimizers are efficient methods to sample k-mers
|
||
from genomic sequences that unconditionally preserve sufficiently
|
||
long matches between sequences. Well-established methods to
|
||
construct efficient minimizers focus on sampling fewer k-mers on
|
||
a random sequence and use universal hitting sets (sets of k-mers
|
||
that appear frequently enough) to upper bound the sketch size. In
|
||
contrast, the problem of sequence-specific minimizers, which is
|
||
to construct efficient minimizers to sample fewer k-mers on a
|
||
specific sequence such as the reference genome, is less studied.
|
||
Currently, the theoretical understanding of this problem is
|
||
lacking, and existing methods do not specialize well to sketch
|
||
specific sequences. RESULTS: We propose the concept of polar
|
||
sets, complementary to the existing idea of universal hitting
|
||
sets. Polar sets are k-mer sets that are spread out enough on the
|
||
reference, and provably specialize well to specific sequences.
|
||
Link energy measures how well spread out a polar set is, and with
|
||
it, the sketch size can be bounded from above and below in a
|
||
theoretically sound way. This allows for direct optimization of
|
||
sketch size. We propose efficient heuristics to construct polar
|
||
sets, and via experiments on the human reference genome, show
|
||
their practical superiority in designing efficient
|
||
sequence-specific minimizers. AVAILABILITY AND IMPLEMENTATION: A
|
||
reference implementation and code for analyses under an
|
||
open-source license are at
|
||
https://github.com/kingsford-group/polarset. SUPPLEMENTARY
|
||
INFORMATION: Supplementary data are available at Bioinformatics
|
||
online.},
|
||
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
|
||
doi = {10.1093/bioinformatics/btab313},
|
||
issn = {1367-4803,1367-4811},
|
||
journal = {Bioinformatics (Oxford, England)},
|
||
language = {en},
|
||
month = jul,
|
||
number = {Suppl\_1},
|
||
pages = {i187--i195},
|
||
pmc = {PMC8686682},
|
||
pmid = 34252928,
|
||
publisher = {Oxford University Press (OUP)},
|
||
title = {Sequence-specific minimizers via polar sets},
|
||
url = {http://dx.doi.org/10.1093/bioinformatics/btab313},
|
||
volume = 37,
|
||
year = 2021,
|
||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btab313}}
|
||
|
||
@article{Pan2024-hb,
|
||
abstract = {MOTIVATION: The minimizer concept is a data structure for
|
||
sequence sketching. The standard canonical minimizer selects a
|
||
subset of k-mers from the given DNA sequence by comparing the
|
||
forward and reverse k-mers in a window simultaneously according
|
||
to a predefined selection scheme. It is widely employed by
|
||
sequence analysis such as read mapping and assembly. k-mer
|
||
density, k-mer repetitiveness (e.g. k-mer bias), and
|
||
computational efficiency are three critical measurements for
|
||
minimizer selection schemes. However, there exist trade-offs
|
||
between kinds of minimizer variants. Generic, effective, and
|
||
efficient are always the requirements for high-performance
|
||
minimizer algorithms. RESULTS: We propose a simple minimizer
|
||
operator as a refinement of the standard canonical minimizer. It
|
||
takes only a few operations to compute. However, it can improve
|
||
the k-mer repetitiveness, especially for the lexicographic order.
|
||
It applies to other selection schemes of total orders (e.g.
|
||
random orders). Moreover, it is computationally efficient and the
|
||
density is close to that of the standard minimizer. The refined
|
||
minimizer may benefit high-performance applications like binning
|
||
and read mapping. AVAILABILITY AND IMPLEMENTATION: The source
|
||
code of the benchmark in this work is available at the github
|
||
repository https://github.com/xp3i4/mini\_benchmark.},
|
||
author = {Pan, Chenxu and Reinert, Knut},
|
||
doi = {10.1093/bioinformatics/btae045},
|
||
issn = {1367-4803,1367-4811},
|
||
journal = {Bioinformatics (Oxford, England)},
|
||
language = {en},
|
||
month = feb,
|
||
number = 2,
|
||
pmc = {PMC10868324},
|
||
pmid = 38269626,
|
||
publisher = {Oxford University Press (OUP)},
|
||
title = {A simple refined DNA minimizer operator enables 2-fold faster computation},
|
||
url = {http://dx.doi.org/10.1093/bioinformatics/btae045},
|
||
volume = 40,
|
||
year = 2024,
|
||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btae045}}
|
||
|
||
@article{Kille2023-px,
|
||
abstract = {MOTIVATION: The Jaccard similarity on k-mer sets has shown to be
|
||
a convenient proxy for sequence identity. By avoiding expensive
|
||
base-level alignments and comparing reduced sequence
|
||
representations, tools such as MashMap can scale to massive
|
||
numbers of pairwise comparisons while still providing useful
|
||
similarity estimates. However, due to their reliance on minimizer
|
||
winnowing, previous versions of MashMap were shown to be biased
|
||
and inconsistent estimators of Jaccard similarity. This directly
|
||
impacts downstream tools that rely on the accuracy of these
|
||
estimates. RESULTS: To address this, we propose the minmer
|
||
winnowing scheme, which generalizes the minimizer scheme by use
|
||
of a rolling minhash with multiple sampled k-mers per window. We
|
||
show both theoretically and empirically that minmers yield an
|
||
unbiased estimator of local Jaccard similarity, and we implement
|
||
this scheme in an updated version of MashMap. The minmer-based
|
||
implementation is over 10 times faster than the minimizer-based
|
||
version under the default ANI threshold, making it well-suited
|
||
for large-scale comparative genomics applications. AVAILABILITY
|
||
AND IMPLEMENTATION: MashMap3 is available at
|
||
https://github.com/marbl/MashMap.},
|
||
author = {Kille, Bryce and Garrison, Erik and Treangen, Todd J and Phillippy, Adam M},
|
||
doi = {10.1093/bioinformatics/btad512},
|
||
issn = {1367-4803,1367-4811},
|
||
journal = {Bioinformatics (Oxford, England)},
|
||
language = {en},
|
||
month = sep,
|
||
number = 9,
|
||
pmc = {PMC10505501},
|
||
pmid = 37603771,
|
||
publisher = {Oxford University Press (OUP)},
|
||
title = {Minmers are a generalization of minimizers that enable unbiased local Jaccard estimation},
|
||
url = {http://dx.doi.org/10.1093/bioinformatics/btad512},
|
||
volume = 39,
|
||
year = 2023,
|
||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btad512}}
|
||
|
||
@incollection{Golan2025-xf,
|
||
address = {Cham},
|
||
author = {Golan, Shay and Shur, Arseny M},
|
||
booktitle = {Lecture Notes in Computer Science},
|
||
doi = {10.1007/978-3-031-82670-2\_25},
|
||
isbn = {9783031826696,9783031826702},
|
||
issn = {0302-9743,1611-3349},
|
||
language = {en},
|
||
pages = {347--360},
|
||
publisher = {Springer Nature Switzerland},
|
||
series = {Lecture Notes in Computer Science},
|
||
title = {Expected density of random minimizers},
|
||
url = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
|
||
year = 2025,
|
||
bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
|
||
bdsk-url-2 = {http://dx.doi.org/10.1007/978-3-031-82670-2%5C_25}}
|
||
|
||
@article{Mohamadi2017-ok,
|
||
abstract = {Motivation: Many bioinformatics algorithms are designed for the
|
||
analysis of sequences of some uniform length, conventionally
|
||
referred to as k -mers. These include de Bruijn graph assembly
|
||
methods and sequence alignment tools. An efficient algorithm to
|
||
enumerate the number of unique k -mers, or even better, to build
|
||
a histogram of k -mer frequencies would be desirable for these
|
||
tools and their downstream analysis pipelines. Among other
|
||
applications, estimated frequencies can be used to predict genome
|
||
sizes, measure sequencing error rates, and tune runtime
|
||
parameters for analysis tools. However, calculating a k -mer
|
||
histogram from large volumes of sequencing data is a challenging
|
||
task. Results: Here, we present ntCard, a streaming algorithm for
|
||
estimating the frequencies of k -mers in genomics datasets. At
|
||
its core, ntCard uses the ntHash algorithm to efficiently compute
|
||
hash values for streamed sequences. It then samples the
|
||
calculated hash values to build a reduced representation
|
||
multiplicity table describing the sample distribution. Finally,
|
||
it uses a statistical model to reconstruct the population
|
||
distribution from the sample distribution. We have compared the
|
||
performance of ntCard and other cardinality estimation
|
||
algorithms. We used three datasets of 480 GB, 500 GB and 2.4 TB
|
||
in size, where the first two representing whole genome shotgun
|
||
sequencing experiments on the human genome and the last one on
|
||
the white spruce genome. Results show ntCard estimates k -mer
|
||
coverage frequencies >15× faster than the state-of-the-art
|
||
algorithms, using similar amount of memory, and with higher
|
||
accuracy rates. Thus, our benchmarks demonstrate ntCard as a
|
||
potentially enabling technology for large-scale genomics
|
||
applications. Availability and Implementation: ntCard is written
|
||
in C ++ and is released under the GPL license. It is freely
|
||
available at https://github.com/bcgsc/ntCard. Contact:
|
||
hmohamadi@bcgsc.ca or ibirol@bcgsc.ca. Supplementary information:
|
||
Supplementary data are available at Bioinformatics online.},
|
||
author = {Mohamadi, Hamid and Khan, Hamza and Birol, Inanc},
|
||
date-modified = {2026-04-18 08:19:36 +0200},
|
||
doi = {10.1093/bioinformatics/btw832},
|
||
issn = {1367-4803,1367-4811},
|
||
journal = {Bioinformatics (Oxford, England)},
|
||
language = {en},
|
||
month = may,
|
||
number = 9,
|
||
pages = {1324--1330},
|
||
pmc = {PMC5408799},
|
||
pmid = 28453674,
|
||
publisher = {Oxford University Press (OUP)},
|
||
title = {ntCard: a streaming algorithm for cardinality estimation in genomics data},
|
||
url = {http://dx.doi.org/10.1093/bioinformatics/btw832},
|
||
volume = 33,
|
||
year = 2017,
|
||
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btw832}}
|