%% This BibTeX bibliography file was created using BibDesk.
%% https://bibdesk.sourceforge.io/

%% Created for Eric Coissac at 2026-04-18 08:19:36 +0200

%% Saved with string encoding Unicode (UTF-8)

%% Layout: one field per line, fields in BibDesk's alphabetical order,
%% BibDesk-managed bdsk-url-N fields last. DOIs are stored bare (no LaTeX
%% escapes); underscores are escaped only in fields that get typeset
%% (number, abstract). Words in titles that must keep their case are
%% protected with braces.

%% Miniception: randomized minimizer construction with density guarantees.
@article{Zheng2020-ji,
  abstract = {MOTIVATION: Minimizers are methods to sample k-mers from a string, with the guarantee that similar set of k-mers will be chosen on similar strings. It is parameterized by the k-mer length k, a window length w and an order on the k-mers. Minimizers are used in a large number of softwares and pipelines to improve computation efficiency and decrease memory usage. Despite the method's popularity, many theoretical questions regarding its performance remain open. The core metric for measuring performance of a minimizer is the density, which measures the sparsity of sampled k-mers. The theoretical optimal density for a minimizer is 1/w, provably not achievable in general. For given k and w, little is known about asymptotically optimal minimizers, that is minimizers with density O(1/w). RESULTS: We derive a necessary and sufficient condition for existence of asymptotically optimal minimizers. We also provide a randomized algorithm, called the Miniception, to design minimizers with the best theoretical guarantee to date on density in practical scenarios. Constructing and using the Miniception is as easy as constructing and using a random minimizer, which allows the design of efficient minimizers that scale to the values of k and w used in current bioinformatics software programs. AVAILABILITY AND IMPLEMENTATION: Reference implementation of the Miniception and the codes for analysis can be found at https://github.com/kingsford-group/miniception. SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
  doi = {10.1093/bioinformatics/btaa472},
  issn = {1367-4803,1367-4811},
  journal = {Bioinformatics (Oxford, England)},
  language = {en},
  month = jul,
  number = {Suppl\_1},
  pages = {i119--i127},
  pmc = {PMC8248892},
  pmid = 32657376,
  publisher = {Oxford University Press (OUP)},
  title = {Improved design and analysis of practical minimizers},
  url = {http://dx.doi.org/10.1093/bioinformatics/btaa472},
  volume = 36,
  year = 2020,
  bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btaa472}}

%% Polar sets: sequence-specific minimizers.
@article{Zheng2021-cc,
  abstract = {MOTIVATION: Minimizers are efficient methods to sample k-mers from genomic sequences that unconditionally preserve sufficiently long matches between sequences. Well-established methods to construct efficient minimizers focus on sampling fewer k-mers on a random sequence and use universal hitting sets (sets of k-mers that appear frequently enough) to upper bound the sketch size. In contrast, the problem of sequence-specific minimizers, which is to construct efficient minimizers to sample fewer k-mers on a specific sequence such as the reference genome, is less studied. Currently, the theoretical understanding of this problem is lacking, and existing methods do not specialize well to sketch specific sequences. RESULTS: We propose the concept of polar sets, complementary to the existing idea of universal hitting sets. Polar sets are k-mer sets that are spread out enough on the reference, and provably specialize well to specific sequences. Link energy measures how well spread out a polar set is, and with it, the sketch size can be bounded from above and below in a theoretically sound way. This allows for direct optimization of sketch size. We propose efficient heuristics to construct polar sets, and via experiments on the human reference genome, show their practical superiority in designing efficient sequence-specific minimizers. AVAILABILITY AND IMPLEMENTATION: A reference implementation and code for analyses under an open-source license are at https://github.com/kingsford-group/polarset. SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
  doi = {10.1093/bioinformatics/btab313},
  issn = {1367-4803,1367-4811},
  journal = {Bioinformatics (Oxford, England)},
  language = {en},
  month = jul,
  number = {Suppl\_1},
  pages = {i187--i195},
  pmc = {PMC8686682},
  pmid = 34252928,
  publisher = {Oxford University Press (OUP)},
  title = {Sequence-specific minimizers via polar sets},
  url = {http://dx.doi.org/10.1093/bioinformatics/btab313},
  volume = 37,
  year = 2021,
  bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btab313}}

%% Refined canonical minimizer operator.
@article{Pan2024-hb,
  abstract = {MOTIVATION: The minimizer concept is a data structure for sequence sketching. The standard canonical minimizer selects a subset of k-mers from the given DNA sequence by comparing the forward and reverse k-mers in a window simultaneously according to a predefined selection scheme. It is widely employed by sequence analysis such as read mapping and assembly. k-mer density, k-mer repetitiveness (e.g. k-mer bias), and computational efficiency are three critical measurements for minimizer selection schemes. However, there exist trade-offs between kinds of minimizer variants. Generic, effective, and efficient are always the requirements for high-performance minimizer algorithms. RESULTS: We propose a simple minimizer operator as a refinement of the standard canonical minimizer. It takes only a few operations to compute. However, it can improve the k-mer repetitiveness, especially for the lexicographic order. It applies to other selection schemes of total orders (e.g. random orders). Moreover, it is computationally efficient and the density is close to that of the standard minimizer. The refined minimizer may benefit high-performance applications like binning and read mapping. AVAILABILITY AND IMPLEMENTATION: The source code of the benchmark in this work is available at the github repository https://github.com/xp3i4/mini\_benchmark.},
  author = {Pan, Chenxu and Reinert, Knut},
  doi = {10.1093/bioinformatics/btae045},
  issn = {1367-4803,1367-4811},
  journal = {Bioinformatics (Oxford, England)},
  language = {en},
  month = feb,
  number = 2,
  pmc = {PMC10868324},
  pmid = 38269626,
  publisher = {Oxford University Press (OUP)},
  title = {A simple refined {DNA} minimizer operator enables 2-fold faster computation},
  url = {http://dx.doi.org/10.1093/bioinformatics/btae045},
  volume = 40,
  year = 2024,
  bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btae045}}

%% Minmers: winnowing scheme used in MashMap3.
@article{Kille2023-px,
  abstract = {MOTIVATION: The Jaccard similarity on k-mer sets has shown to be a convenient proxy for sequence identity. By avoiding expensive base-level alignments and comparing reduced sequence representations, tools such as MashMap can scale to massive numbers of pairwise comparisons while still providing useful similarity estimates. However, due to their reliance on minimizer winnowing, previous versions of MashMap were shown to be biased and inconsistent estimators of Jaccard similarity. This directly impacts downstream tools that rely on the accuracy of these estimates. RESULTS: To address this, we propose the minmer winnowing scheme, which generalizes the minimizer scheme by use of a rolling minhash with multiple sampled k-mers per window. We show both theoretically and empirically that minmers yield an unbiased estimator of local Jaccard similarity, and we implement this scheme in an updated version of MashMap. The minmer-based implementation is over 10 times faster than the minimizer-based version under the default ANI threshold, making it well-suited for large-scale comparative genomics applications. AVAILABILITY AND IMPLEMENTATION: MashMap3 is available at https://github.com/marbl/MashMap.},
  author = {Kille, Bryce and Garrison, Erik and Treangen, Todd J and Phillippy, Adam M},
  doi = {10.1093/bioinformatics/btad512},
  issn = {1367-4803,1367-4811},
  journal = {Bioinformatics (Oxford, England)},
  language = {en},
  month = sep,
  number = 9,
  pmc = {PMC10505501},
  pmid = 37603771,
  publisher = {Oxford University Press (OUP)},
  title = {Minmers are a generalization of minimizers that enable unbiased local {Jaccard} estimation},
  url = {http://dx.doi.org/10.1093/bioinformatics/btad512},
  volume = 39,
  year = 2023,
  bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btad512}}

%% Expected density of random minimizers (LNCS chapter).
%% NOTE(review): booktitle currently repeats the series name; replace it with
%% the title of the containing volume (and add the LNCS volume number) once
%% confirmed against the DOI record.
@incollection{Golan2025-xf,
  address = {Cham},
  author = {Golan, Shay and Shur, Arseny M},
  booktitle = {Lecture Notes in Computer Science},
  doi = {10.1007/978-3-031-82670-2_25},
  isbn = {9783031826696,9783031826702},
  issn = {0302-9743,1611-3349},
  language = {en},
  pages = {347--360},
  publisher = {Springer Nature Switzerland},
  series = {Lecture Notes in Computer Science},
  title = {Expected density of random minimizers},
  url = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
  year = 2025,
  bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-031-82670-2_25}}

%% ntCard: streaming estimation of k-mer frequency histograms.
@article{Mohamadi2017-ok,
  abstract = {Motivation: Many bioinformatics algorithms are designed for the analysis of sequences of some uniform length, conventionally referred to as k-mers. These include de Bruijn graph assembly methods and sequence alignment tools. An efficient algorithm to enumerate the number of unique k-mers, or even better, to build a histogram of k-mer frequencies would be desirable for these tools and their downstream analysis pipelines. Among other applications, estimated frequencies can be used to predict genome sizes, measure sequencing error rates, and tune runtime parameters for analysis tools. However, calculating a k-mer histogram from large volumes of sequencing data is a challenging task. Results: Here, we present ntCard, a streaming algorithm for estimating the frequencies of k-mers in genomics datasets. At its core, ntCard uses the ntHash algorithm to efficiently compute hash values for streamed sequences. It then samples the calculated hash values to build a reduced representation multiplicity table describing the sample distribution. Finally, it uses a statistical model to reconstruct the population distribution from the sample distribution. We have compared the performance of ntCard and other cardinality estimation algorithms. We used three datasets of 480 GB, 500 GB and 2.4 TB in size, where the first two representing whole genome shotgun sequencing experiments on the human genome and the last one on the white spruce genome. Results show ntCard estimates k-mer coverage frequencies >15× faster than the state-of-the-art algorithms, using similar amount of memory, and with higher accuracy rates. Thus, our benchmarks demonstrate ntCard as a potentially enabling technology for large-scale genomics applications. Availability and Implementation: ntCard is written in C++ and is released under the GPL license. It is freely available at https://github.com/bcgsc/ntCard. Contact: hmohamadi@bcgsc.ca or ibirol@bcgsc.ca. Supplementary information: Supplementary data are available at Bioinformatics online.},
  author = {Mohamadi, Hamid and Khan, Hamza and Birol, Inanc},
  date-modified = {2026-04-18 08:19:36 +0200},
  doi = {10.1093/bioinformatics/btw832},
  issn = {1367-4803,1367-4811},
  journal = {Bioinformatics (Oxford, England)},
  language = {en},
  month = may,
  number = 9,
  pages = {1324--1330},
  pmc = {PMC5408799},
  pmid = 28453674,
  publisher = {Oxford University Press (OUP)},
  title = {{ntCard}: a streaming algorithm for cardinality estimation in genomics data},
  url = {http://dx.doi.org/10.1093/bioinformatics/btw832},
  volume = 33,
  year = 2017,
  bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btw832}}