Files
obikmer/doc/references.bib
2026-04-19 12:17:16 +02:00

244 lines
13 KiB
BibTeX
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/
%% Created for Eric Coissac at 2026-04-18 08:19:36 +0200
%% Saved with string encoding Unicode (UTF-8)
@article{Zheng2020-ji,
abstract = {MOTIVATION: Minimizers are methods to sample k-mers from a
string, with the guarantee that similar set of k-mers will be
chosen on similar strings. It is parameterized by the k-mer
length k, a window length w and an order on the k-mers.
Minimizers are used in a large number of softwares and pipelines
to improve computation efficiency and decrease memory usage.
Despite the method's popularity, many theoretical questions
regarding its performance remain open. The core metric for
measuring performance of a minimizer is the density, which
measures the sparsity of sampled k-mers. The theoretical optimal
density for a minimizer is 1/w, provably not achievable in
general. For given k and w, little is known about asymptotically
optimal minimizers, that is minimizers with density O(1/w).
RESULTS: We derive a necessary and sufficient condition for
existence of asymptotically optimal minimizers. We also provide a
randomized algorithm, called the Miniception, to design
minimizers with the best theoretical guarantee to date on density
in practical scenarios. Constructing and using the Miniception is
as easy as constructing and using a random minimizer, which
allows the design of efficient minimizers that scale to the
values of k and w used in current bioinformatics software
programs. AVAILABILITY AND IMPLEMENTATION: Reference
implementation of the Miniception and the codes for analysis can
be found at https://github.com/kingsford-group/miniception.
SUPPLEMENTARY INFORMATION: Supplementary data are available at
Bioinformatics online.},
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
doi = {10.1093/bioinformatics/btaa472},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = jul,
number = {Suppl_1},
pages = {i119--i127},
pmc = {PMC8248892},
pmid = 32657376,
publisher = {Oxford University Press (OUP)},
title = {Improved design and analysis of practical minimizers},
url = {http://dx.doi.org/10.1093/bioinformatics/btaa472},
volume = 36,
year = 2020,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btaa472}}
@article{Zheng2021-cc,
abstract = {MOTIVATION: Minimizers are efficient methods to sample k-mers
from genomic sequences that unconditionally preserve sufficiently
long matches between sequences. Well-established methods to
construct efficient minimizers focus on sampling fewer k-mers on
a random sequence and use universal hitting sets (sets of k-mers
that appear frequently enough) to upper bound the sketch size. In
contrast, the problem of sequence-specific minimizers, which is
to construct efficient minimizers to sample fewer k-mers on a
specific sequence such as the reference genome, is less studied.
Currently, the theoretical understanding of this problem is
lacking, and existing methods do not specialize well to sketch
specific sequences. RESULTS: We propose the concept of polar
sets, complementary to the existing idea of universal hitting
sets. Polar sets are k-mer sets that are spread out enough on the
reference, and provably specialize well to specific sequences.
Link energy measures how well spread out a polar set is, and with
it, the sketch size can be bounded from above and below in a
theoretically sound way. This allows for direct optimization of
sketch size. We propose efficient heuristics to construct polar
sets, and via experiments on the human reference genome, show
their practical superiority in designing efficient
sequence-specific minimizers. AVAILABILITY AND IMPLEMENTATION: A
reference implementation and code for analyses under an
open-source license are at
https://github.com/kingsford-group/polarset. SUPPLEMENTARY
INFORMATION: Supplementary data are available at Bioinformatics
online.},
author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
doi = {10.1093/bioinformatics/btab313},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = jul,
number = {Suppl\_1},
pages = {i187--i195},
pmc = {PMC8686682},
pmid = 34252928,
publisher = {Oxford University Press (OUP)},
title = {Sequence-specific minimizers via polar sets},
url = {http://dx.doi.org/10.1093/bioinformatics/btab313},
volume = 37,
year = 2021,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btab313}}
@article{Pan2024-hb,
abstract = {MOTIVATION: The minimizer concept is a data structure for
sequence sketching. The standard canonical minimizer selects a
subset of k-mers from the given DNA sequence by comparing the
forward and reverse k-mers in a window simultaneously according
to a predefined selection scheme. It is widely employed by
sequence analysis such as read mapping and assembly. k-mer
density, k-mer repetitiveness (e.g. k-mer bias), and
computational efficiency are three critical measurements for
minimizer selection schemes. However, there exist trade-offs
between kinds of minimizer variants. Generic, effective, and
efficient are always the requirements for high-performance
minimizer algorithms. RESULTS: We propose a simple minimizer
operator as a refinement of the standard canonical minimizer. It
takes only a few operations to compute. However, it can improve
the k-mer repetitiveness, especially for the lexicographic order.
It applies to other selection schemes of total orders (e.g.
random orders). Moreover, it is computationally efficient and the
density is close to that of the standard minimizer. The refined
minimizer may benefit high-performance applications like binning
and read mapping. AVAILABILITY AND IMPLEMENTATION: The source
code of the benchmark in this work is available at the github
repository https://github.com/xp3i4/mini\_benchmark.},
author = {Pan, Chenxu and Reinert, Knut},
doi = {10.1093/bioinformatics/btae045},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = feb,
number = 2,
pmc = {PMC10868324},
pmid = 38269626,
publisher = {Oxford University Press (OUP)},
title = {A simple refined DNA minimizer operator enables 2-fold faster computation},
url = {http://dx.doi.org/10.1093/bioinformatics/btae045},
volume = 40,
year = 2024,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btae045}}
@article{Kille2023-px,
abstract = {MOTIVATION: The Jaccard similarity on k-mer sets has shown to be
a convenient proxy for sequence identity. By avoiding expensive
base-level alignments and comparing reduced sequence
representations, tools such as MashMap can scale to massive
numbers of pairwise comparisons while still providing useful
similarity estimates. However, due to their reliance on minimizer
winnowing, previous versions of MashMap were shown to be biased
and inconsistent estimators of Jaccard similarity. This directly
impacts downstream tools that rely on the accuracy of these
estimates. RESULTS: To address this, we propose the minmer
winnowing scheme, which generalizes the minimizer scheme by use
of a rolling minhash with multiple sampled k-mers per window. We
show both theoretically and empirically that minmers yield an
unbiased estimator of local Jaccard similarity, and we implement
this scheme in an updated version of MashMap. The minmer-based
implementation is over 10 times faster than the minimizer-based
version under the default ANI threshold, making it well-suited
for large-scale comparative genomics applications. AVAILABILITY
AND IMPLEMENTATION: MashMap3 is available at
https://github.com/marbl/MashMap.},
author = {Kille, Bryce and Garrison, Erik and Treangen, Todd J and Phillippy, Adam M},
doi = {10.1093/bioinformatics/btad512},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = sep,
number = 9,
pmc = {PMC10505501},
pmid = 37603771,
publisher = {Oxford University Press (OUP)},
title = {Minmers are a generalization of minimizers that enable unbiased local Jaccard estimation},
url = {http://dx.doi.org/10.1093/bioinformatics/btad512},
volume = 39,
year = 2023,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btad512}}
@incollection{Golan2025-xf,
address = {Cham},
author = {Golan, Shay and Shur, Arseny M},
booktitle = {Lecture Notes in Computer Science},
doi = {10.1007/978-3-031-82670-2\_25},
isbn = {9783031826696,9783031826702},
issn = {0302-9743,1611-3349},
language = {en},
pages = {347--360},
publisher = {Springer Nature Switzerland},
series = {Lecture Notes in Computer Science},
title = {Expected density of random minimizers},
url = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
year = 2025,
bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
bdsk-url-2 = {http://dx.doi.org/10.1007/978-3-031-82670-2%5C_25}}
@article{Mohamadi2017-ok,
abstract = {Motivation: Many bioinformatics algorithms are designed for the
analysis of sequences of some uniform length, conventionally
referred to as k -mers. These include de Bruijn graph assembly
methods and sequence alignment tools. An efficient algorithm to
enumerate the number of unique k -mers, or even better, to build
a histogram of k -mer frequencies would be desirable for these
tools and their downstream analysis pipelines. Among other
applications, estimated frequencies can be used to predict genome
sizes, measure sequencing error rates, and tune runtime
parameters for analysis tools. However, calculating a k -mer
histogram from large volumes of sequencing data is a challenging
task. Results: Here, we present ntCard, a streaming algorithm for
estimating the frequencies of k -mers in genomics datasets. At
its core, ntCard uses the ntHash algorithm to efficiently compute
hash values for streamed sequences. It then samples the
calculated hash values to build a reduced representation
multiplicity table describing the sample distribution. Finally,
it uses a statistical model to reconstruct the population
distribution from the sample distribution. We have compared the
performance of ntCard and other cardinality estimation
algorithms. We used three datasets of 480 GB, 500 GB and 2.4 TB
in size, where the first two representing whole genome shotgun
sequencing experiments on the human genome and the last one on
the white spruce genome. Results show ntCard estimates k -mer
coverage frequencies >15× faster than the state-of-the-art
algorithms, using similar amount of memory, and with higher
accuracy rates. Thus, our benchmarks demonstrate ntCard as a
potentially enabling technology for large-scale genomics
applications. Availability and Implementation: ntCard is written
in C ++ and is released under the GPL license. It is freely
available at https://github.com/bcgsc/ntCard. Contact:
hmohamadi@bcgsc.ca or ibirol@bcgsc.ca. Supplementary information:
Supplementary data are available at Bioinformatics online.},
author = {Mohamadi, Hamid and Khan, Hamza and Birol, Inanc},
date-modified = {2026-04-18 08:19:36 +0200},
doi = {10.1093/bioinformatics/btw832},
issn = {1367-4803,1367-4811},
journal = {Bioinformatics (Oxford, England)},
language = {en},
month = may,
number = 9,
pages = {1324--1330},
pmc = {PMC5408799},
pmid = 28453674,
publisher = {Oxford University Press (OUP)},
title = {ntCard: a streaming algorithm for cardinality estimation in genomics data},
url = {http://dx.doi.org/10.1093/bioinformatics/btw832},
volume = 33,
year = 2017,
bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btw832}}