obikmer/doc/references.bib

%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/

%% Created for Eric Coissac at 2026-04-18 08:19:36 +0200


%% Saved with string encoding Unicode (UTF-8)


@article{Zheng2020-ji,
	abstract = {MOTIVATION: Minimizers are methods to sample k-mers from a
               string, with the guarantee that similar set of k-mers will be
               chosen on similar strings. It is parameterized by the k-mer
               length k, a window length w and an order on the k-mers.
               Minimizers are used in a large number of softwares and pipelines
               to improve computation efficiency and decrease memory usage.
               Despite the method's popularity, many theoretical questions
               regarding its performance remain open. The core metric for
               measuring performance of a minimizer is the density, which
               measures the sparsity of sampled k-mers. The theoretical optimal
               density for a minimizer is 1/w, provably not achievable in
               general. For given k and w, little is known about asymptotically
               optimal minimizers, that is minimizers with density O(1/w).
               RESULTS: We derive a necessary and sufficient condition for
               existence of asymptotically optimal minimizers. We also provide a
               randomized algorithm, called the Miniception, to design
               minimizers with the best theoretical guarantee to date on density
               in practical scenarios. Constructing and using the Miniception is
               as easy as constructing and using a random minimizer, which
               allows the design of efficient minimizers that scale to the
               values of k and w used in current bioinformatics software
               programs. AVAILABILITY AND IMPLEMENTATION: Reference
               implementation of the Miniception and the codes for analysis can
               be found at https://github.com/kingsford-group/miniception.
               SUPPLEMENTARY INFORMATION: Supplementary data are available at
               Bioinformatics online.},
	author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
	doi = {10.1093/bioinformatics/btaa472},
	issn = {1367-4803,1367-4811},
	journal = {Bioinformatics (Oxford, England)},
	language = {en},
	month = jul,
	number = {Suppl_1},
	pages = {i119--i127},
	pmc = {PMC8248892},
	pmid = 32657376,
	publisher = {Oxford University Press (OUP)},
	title = {Improved design and analysis of practical minimizers},
	url = {http://dx.doi.org/10.1093/bioinformatics/btaa472},
	volume = 36,
	year = 2020,
	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btaa472}}

@article{Zheng2021-cc,
	abstract = {MOTIVATION: Minimizers are efficient methods to sample k-mers
               from genomic sequences that unconditionally preserve sufficiently
               long matches between sequences. Well-established methods to
               construct efficient minimizers focus on sampling fewer k-mers on
               a random sequence and use universal hitting sets (sets of k-mers
               that appear frequently enough) to upper bound the sketch size. In
               contrast, the problem of sequence-specific minimizers, which is
               to construct efficient minimizers to sample fewer k-mers on a
               specific sequence such as the reference genome, is less studied.
               Currently, the theoretical understanding of this problem is
               lacking, and existing methods do not specialize well to sketch
               specific sequences. RESULTS: We propose the concept of polar
               sets, complementary to the existing idea of universal hitting
               sets. Polar sets are k-mer sets that are spread out enough on the
               reference, and provably specialize well to specific sequences.
               Link energy measures how well spread out a polar set is, and with
               it, the sketch size can be bounded from above and below in a
               theoretically sound way. This allows for direct optimization of
               sketch size. We propose efficient heuristics to construct polar
               sets, and via experiments on the human reference genome, show
               their practical superiority in designing efficient
               sequence-specific minimizers. AVAILABILITY AND IMPLEMENTATION: A
               reference implementation and code for analyses under an
               open-source license are at
               https://github.com/kingsford-group/polarset. SUPPLEMENTARY
               INFORMATION: Supplementary data are available at Bioinformatics
               online.},
	author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
	doi = {10.1093/bioinformatics/btab313},
	issn = {1367-4803,1367-4811},
	journal = {Bioinformatics (Oxford, England)},
	language = {en},
	month = jul,
	number = {Suppl\_1},
	pages = {i187--i195},
	pmc = {PMC8686682},
	pmid = 34252928,
	publisher = {Oxford University Press (OUP)},
	title = {Sequence-specific minimizers via polar sets},
	url = {http://dx.doi.org/10.1093/bioinformatics/btab313},
	volume = 37,
	year = 2021,
	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btab313}}

@article{Pan2024-hb,
	abstract = {MOTIVATION: The minimizer concept is a data structure for
               sequence sketching. The standard canonical minimizer selects a
               subset of k-mers from the given DNA sequence by comparing the
               forward and reverse k-mers in a window simultaneously according
               to a predefined selection scheme. It is widely employed by
               sequence analysis such as read mapping and assembly. k-mer
               density, k-mer repetitiveness (e.g. k-mer bias), and
               computational efficiency are three critical measurements for
               minimizer selection schemes. However, there exist trade-offs
               between kinds of minimizer variants. Generic, effective, and
               efficient are always the requirements for high-performance
               minimizer algorithms. RESULTS: We propose a simple minimizer
               operator as a refinement of the standard canonical minimizer. It
               takes only a few operations to compute. However, it can improve
               the k-mer repetitiveness, especially for the lexicographic order.
               It applies to other selection schemes of total orders (e.g.
               random orders). Moreover, it is computationally efficient and the
               density is close to that of the standard minimizer. The refined
               minimizer may benefit high-performance applications like binning
               and read mapping. AVAILABILITY AND IMPLEMENTATION: The source
               code of the benchmark in this work is available at the github
               repository https://github.com/xp3i4/mini\_benchmark.},
	author = {Pan, Chenxu and Reinert, Knut},
	doi = {10.1093/bioinformatics/btae045},
	issn = {1367-4803,1367-4811},
	journal = {Bioinformatics (Oxford, England)},
	language = {en},
	month = feb,
	number = 2,
	pmc = {PMC10868324},
	pmid = 38269626,
	publisher = {Oxford University Press (OUP)},
	title = {A simple refined DNA minimizer operator enables 2-fold faster computation},
	url = {http://dx.doi.org/10.1093/bioinformatics/btae045},
	volume = 40,
	year = 2024,
	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btae045}}

@article{Kille2023-px,
	abstract = {MOTIVATION: The Jaccard similarity on k-mer sets has shown to be
               a convenient proxy for sequence identity. By avoiding expensive
               base-level alignments and comparing reduced sequence
               representations, tools such as MashMap can scale to massive
               numbers of pairwise comparisons while still providing useful
               similarity estimates. However, due to their reliance on minimizer
               winnowing, previous versions of MashMap were shown to be biased
               and inconsistent estimators of Jaccard similarity. This directly
               impacts downstream tools that rely on the accuracy of these
               estimates. RESULTS: To address this, we propose the minmer
               winnowing scheme, which generalizes the minimizer scheme by use
               of a rolling minhash with multiple sampled k-mers per window. We
               show both theoretically and empirically that minmers yield an
               unbiased estimator of local Jaccard similarity, and we implement
               this scheme in an updated version of MashMap. The minmer-based
               implementation is over 10 times faster than the minimizer-based
               version under the default ANI threshold, making it well-suited
               for large-scale comparative genomics applications. AVAILABILITY
               AND IMPLEMENTATION: MashMap3 is available at
               https://github.com/marbl/MashMap.},
	author = {Kille, Bryce and Garrison, Erik and Treangen, Todd J and Phillippy, Adam M},
	doi = {10.1093/bioinformatics/btad512},
	issn = {1367-4803,1367-4811},
	journal = {Bioinformatics (Oxford, England)},
	language = {en},
	month = sep,
	number = 9,
	pmc = {PMC10505501},
	pmid = 37603771,
	publisher = {Oxford University Press (OUP)},
	title = {Minmers are a generalization of minimizers that enable unbiased local Jaccard estimation},
	url = {http://dx.doi.org/10.1093/bioinformatics/btad512},
	volume = 39,
	year = 2023,
	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btad512}}

@incollection{Golan2025-xf,
	address = {Cham},
	author = {Golan, Shay and Shur, Arseny M},
	booktitle = {Lecture Notes in Computer Science},
	doi = {10.1007/978-3-031-82670-2\_25},
	isbn = {9783031826696,9783031826702},
	issn = {0302-9743,1611-3349},
	language = {en},
	pages = {347--360},
	publisher = {Springer Nature Switzerland},
	series = {Lecture Notes in Computer Science},
	title = {Expected density of random minimizers},
	url = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
	year = 2025,
	bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
	bdsk-url-2 = {http://dx.doi.org/10.1007/978-3-031-82670-2%5C_25}}

@article{Mohamadi2017-ok,
	abstract = {Motivation: Many bioinformatics algorithms are designed for the
               analysis of sequences of some uniform length, conventionally
               referred to as k -mers. These include de Bruijn graph assembly
               methods and sequence alignment tools. An efficient algorithm to
               enumerate the number of unique k -mers, or even better, to build
               a histogram of k -mer frequencies would be desirable for these
               tools and their downstream analysis pipelines. Among other
               applications, estimated frequencies can be used to predict genome
               sizes, measure sequencing error rates, and tune runtime
               parameters for analysis tools. However, calculating a k -mer
               histogram from large volumes of sequencing data is a challenging
               task. Results: Here, we present ntCard, a streaming algorithm for
               estimating the frequencies of k -mers in genomics datasets. At
               its core, ntCard uses the ntHash algorithm to efficiently compute
               hash values for streamed sequences. It then samples the
               calculated hash values to build a reduced representation
               multiplicity table describing the sample distribution. Finally,
               it uses a statistical model to reconstruct the population
               distribution from the sample distribution. We have compared the
               performance of ntCard and other cardinality estimation
               algorithms. We used three datasets of 480 GB, 500 GB and 2.4 TB
               in size, where the first two representing whole genome shotgun
               sequencing experiments on the human genome and the last one on
               the white spruce genome. Results show ntCard estimates k -mer
               coverage frequencies >15× faster than the state-of-the-art
               algorithms, using similar amount of memory, and with higher
               accuracy rates. Thus, our benchmarks demonstrate ntCard as a
               potentially enabling technology for large-scale genomics
               applications. Availability and Implementation: ntCard is written
               in C ++ and is released under the GPL license. It is freely
               available at https://github.com/bcgsc/ntCard. Contact:
               hmohamadi@bcgsc.ca or ibirol@bcgsc.ca. Supplementary information:
               Supplementary data are available at Bioinformatics online.},
	author = {Mohamadi, Hamid and Khan, Hamza and Birol, Inanc},
	date-modified = {2026-04-18 08:19:36 +0200},
	doi = {10.1093/bioinformatics/btw832},
	issn = {1367-4803,1367-4811},
	journal = {Bioinformatics (Oxford, England)},
	language = {en},
	month = may,
	number = 9,
	pages = {1324--1330},
	pmc = {PMC5408799},
	pmid = 28453674,
	publisher = {Oxford University Press (OUP)},
	title = {ntCard: a streaming algorithm for cardinality estimation in genomics data},
	url = {http://dx.doi.org/10.1093/bioinformatics/btw832},
	volume = 33,
	year = 2017,
	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btw832}}