first implementation but far to be optimal

2026-04-16 22:38:20 +02:00
commit de3f9b16cf
19336 changed files with 380276 additions and 0 deletions
@@ -0,0 +1,243 @@
+%% This BibTeX bibliography file was created using BibDesk.
+%% https://bibdesk.sourceforge.io/
+
+%% Created for Eric Coissac at 2026-04-18 08:19:36 +0200
+
+
+%% Saved with string encoding Unicode (UTF-8)
+
+
+
+@article{Zheng2020-ji,
+	abstract = {MOTIVATION: Minimizers are methods to sample k-mers from a
+               string, with the guarantee that similar set of k-mers will be
+               chosen on similar strings. It is parameterized by the k-mer
+               length k, a window length w and an order on the k-mers.
+               Minimizers are used in a large number of softwares and pipelines
+               to improve computation efficiency and decrease memory usage.
+               Despite the method's popularity, many theoretical questions
+               regarding its performance remain open. The core metric for
+               measuring performance of a minimizer is the density, which
+               measures the sparsity of sampled k-mers. The theoretical optimal
+               density for a minimizer is 1/w, provably not achievable in
+               general. For given k and w, little is known about asymptotically
+               optimal minimizers, that is minimizers with density O(1/w).
+               RESULTS: We derive a necessary and sufficient condition for
+               existence of asymptotically optimal minimizers. We also provide a
+               randomized algorithm, called the Miniception, to design
+               minimizers with the best theoretical guarantee to date on density
+               in practical scenarios. Constructing and using the Miniception is
+               as easy as constructing and using a random minimizer, which
+               allows the design of efficient minimizers that scale to the
+               values of k and w used in current bioinformatics software
+               programs. AVAILABILITY AND IMPLEMENTATION: Reference
+               implementation of the Miniception and the codes for analysis can
+               be found at https://github.com/kingsford-group/miniception.
+               SUPPLEMENTARY INFORMATION: Supplementary data are available at
+               Bioinformatics online.},
+	author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
+	doi = {10.1093/bioinformatics/btaa472},
+	issn = {1367-4803,1367-4811},
+	journal = {Bioinformatics (Oxford, England)},
+	language = {en},
+	month = jul,
+	number = {Suppl_1},
+	pages = {i119--i127},
+	pmc = {PMC8248892},
+	pmid = 32657376,
+	publisher = {Oxford University Press (OUP)},
+	title = {Improved design and analysis of practical minimizers},
+	url = {http://dx.doi.org/10.1093/bioinformatics/btaa472},
+	volume = 36,
+	year = 2020,
+	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btaa472}}
+
+@article{Zheng2021-cc,
+	abstract = {MOTIVATION: Minimizers are efficient methods to sample k-mers
+               from genomic sequences that unconditionally preserve sufficiently
+               long matches between sequences. Well-established methods to
+               construct efficient minimizers focus on sampling fewer k-mers on
+               a random sequence and use universal hitting sets (sets of k-mers
+               that appear frequently enough) to upper bound the sketch size. In
+               contrast, the problem of sequence-specific minimizers, which is
+               to construct efficient minimizers to sample fewer k-mers on a
+               specific sequence such as the reference genome, is less studied.
+               Currently, the theoretical understanding of this problem is
+               lacking, and existing methods do not specialize well to sketch
+               specific sequences. RESULTS: We propose the concept of polar
+               sets, complementary to the existing idea of universal hitting
+               sets. Polar sets are k-mer sets that are spread out enough on the
+               reference, and provably specialize well to specific sequences.
+               Link energy measures how well spread out a polar set is, and with
+               it, the sketch size can be bounded from above and below in a
+               theoretically sound way. This allows for direct optimization of
+               sketch size. We propose efficient heuristics to construct polar
+               sets, and via experiments on the human reference genome, show
+               their practical superiority in designing efficient
+               sequence-specific minimizers. AVAILABILITY AND IMPLEMENTATION: A
+               reference implementation and code for analyses under an
+               open-source license are at
+               https://github.com/kingsford-group/polarset. SUPPLEMENTARY
+               INFORMATION: Supplementary data are available at Bioinformatics
+               online.},
+	author = {Zheng, Hongyu and Kingsford, Carl and Mar{\c c}ais, Guillaume},
+	doi = {10.1093/bioinformatics/btab313},
+	issn = {1367-4803,1367-4811},
+	journal = {Bioinformatics (Oxford, England)},
+	language = {en},
+	month = jul,
+	number = {Suppl\_1},
+	pages = {i187--i195},
+	pmc = {PMC8686682},
+	pmid = 34252928,
+	publisher = {Oxford University Press (OUP)},
+	title = {Sequence-specific minimizers via polar sets},
+	url = {http://dx.doi.org/10.1093/bioinformatics/btab313},
+	volume = 37,
+	year = 2021,
+	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btab313}}
+
+@article{Pan2024-hb,
+	abstract = {MOTIVATION: The minimizer concept is a data structure for
+               sequence sketching. The standard canonical minimizer selects a
+               subset of k-mers from the given DNA sequence by comparing the
+               forward and reverse k-mers in a window simultaneously according
+               to a predefined selection scheme. It is widely employed by
+               sequence analysis such as read mapping and assembly. k-mer
+               density, k-mer repetitiveness (e.g. k-mer bias), and
+               computational efficiency are three critical measurements for
+               minimizer selection schemes. However, there exist trade-offs
+               between kinds of minimizer variants. Generic, effective, and
+               efficient are always the requirements for high-performance
+               minimizer algorithms. RESULTS: We propose a simple minimizer
+               operator as a refinement of the standard canonical minimizer. It
+               takes only a few operations to compute. However, it can improve
+               the k-mer repetitiveness, especially for the lexicographic order.
+               It applies to other selection schemes of total orders (e.g.
+               random orders). Moreover, it is computationally efficient and the
+               density is close to that of the standard minimizer. The refined
+               minimizer may benefit high-performance applications like binning
+               and read mapping. AVAILABILITY AND IMPLEMENTATION: The source
+               code of the benchmark in this work is available at the github
+               repository https://github.com/xp3i4/mini\_benchmark.},
+	author = {Pan, Chenxu and Reinert, Knut},
+	doi = {10.1093/bioinformatics/btae045},
+	issn = {1367-4803,1367-4811},
+	journal = {Bioinformatics (Oxford, England)},
+	language = {en},
+	month = feb,
+	number = 2,
+	pmc = {PMC10868324},
+	pmid = 38269626,
+	publisher = {Oxford University Press (OUP)},
+	title = {A simple refined DNA minimizer operator enables 2-fold faster computation},
+	url = {http://dx.doi.org/10.1093/bioinformatics/btae045},
+	volume = 40,
+	year = 2024,
+	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btae045}}
+
+@article{Kille2023-px,
+	abstract = {MOTIVATION: The Jaccard similarity on k-mer sets has shown to be
+               a convenient proxy for sequence identity. By avoiding expensive
+               base-level alignments and comparing reduced sequence
+               representations, tools such as MashMap can scale to massive
+               numbers of pairwise comparisons while still providing useful
+               similarity estimates. However, due to their reliance on minimizer
+               winnowing, previous versions of MashMap were shown to be biased
+               and inconsistent estimators of Jaccard similarity. This directly
+               impacts downstream tools that rely on the accuracy of these
+               estimates. RESULTS: To address this, we propose the minmer
+               winnowing scheme, which generalizes the minimizer scheme by use
+               of a rolling minhash with multiple sampled k-mers per window. We
+               show both theoretically and empirically that minmers yield an
+               unbiased estimator of local Jaccard similarity, and we implement
+               this scheme in an updated version of MashMap. The minmer-based
+               implementation is over 10 times faster than the minimizer-based
+               version under the default ANI threshold, making it well-suited
+               for large-scale comparative genomics applications. AVAILABILITY
+               AND IMPLEMENTATION: MashMap3 is available at
+               https://github.com/marbl/MashMap.},
+	author = {Kille, Bryce and Garrison, Erik and Treangen, Todd J and Phillippy, Adam M},
+	doi = {10.1093/bioinformatics/btad512},
+	issn = {1367-4803,1367-4811},
+	journal = {Bioinformatics (Oxford, England)},
+	language = {en},
+	month = sep,
+	number = 9,
+	pmc = {PMC10505501},
+	pmid = 37603771,
+	publisher = {Oxford University Press (OUP)},
+	title = {Minmers are a generalization of minimizers that enable unbiased local Jaccard estimation},
+	url = {http://dx.doi.org/10.1093/bioinformatics/btad512},
+	volume = 39,
+	year = 2023,
+	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btad512}}
+
+@incollection{Golan2025-xf,
+	address = {Cham},
+	author = {Golan, Shay and Shur, Arseny M},
+	booktitle = {Lecture Notes in Computer Science},
+	doi = {10.1007/978-3-031-82670-2\_25},
+	isbn = {9783031826696,9783031826702},
+	issn = {0302-9743,1611-3349},
+	language = {en},
+	pages = {347--360},
+	publisher = {Springer Nature Switzerland},
+	series = {Lecture Notes in Computer Science},
+	title = {Expected density of random minimizers},
+	url = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
+	year = 2025,
+	bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-031-82670-2_25},
+	bdsk-url-2 = {http://dx.doi.org/10.1007/978-3-031-82670-2%5C_25}}
+
+@article{Mohamadi2017-ok,
+	abstract = {Motivation: Many bioinformatics algorithms are designed for the
+               analysis of sequences of some uniform length, conventionally
+               referred to as k -mers. These include de Bruijn graph assembly
+               methods and sequence alignment tools. An efficient algorithm to
+               enumerate the number of unique k -mers, or even better, to build
+               a histogram of k -mer frequencies would be desirable for these
+               tools and their downstream analysis pipelines. Among other
+               applications, estimated frequencies can be used to predict genome
+               sizes, measure sequencing error rates, and tune runtime
+               parameters for analysis tools. However, calculating a k -mer
+               histogram from large volumes of sequencing data is a challenging
+               task. Results: Here, we present ntCard, a streaming algorithm for
+               estimating the frequencies of k -mers in genomics datasets. At
+               its core, ntCard uses the ntHash algorithm to efficiently compute
+               hash values for streamed sequences. It then samples the
+               calculated hash values to build a reduced representation
+               multiplicity table describing the sample distribution. Finally,
+               it uses a statistical model to reconstruct the population
+               distribution from the sample distribution. We have compared the
+               performance of ntCard and other cardinality estimation
+               algorithms. We used three datasets of 480 GB, 500 GB and 2.4 TB
+               in size, where the first two representing whole genome shotgun
+               sequencing experiments on the human genome and the last one on
+               the white spruce genome. Results show ntCard estimates k -mer
+               coverage frequencies >15× faster than the state-of-the-art
+               algorithms, using similar amount of memory, and with higher
+               accuracy rates. Thus, our benchmarks demonstrate ntCard as a
+               potentially enabling technology for large-scale genomics
+               applications. Availability and Implementation: ntCard is written
+               in C ++ and is released under the GPL license. It is freely
+               available at https://github.com/bcgsc/ntCard. Contact:
+               hmohamadi@bcgsc.ca or ibirol@bcgsc.ca. Supplementary information:
+               Supplementary data are available at Bioinformatics online.},
+	author = {Mohamadi, Hamid and Khan, Hamza and Birol, Inanc},
+	date-modified = {2026-04-18 08:19:36 +0200},
+	doi = {10.1093/bioinformatics/btw832},
+	issn = {1367-4803,1367-4811},
+	journal = {Bioinformatics (Oxford, England)},
+	language = {en},
+	month = may,
+	number = 9,
+	pages = {1324--1330},
+	pmc = {PMC5408799},
+	pmid = 28453674,
+	publisher = {Oxford University Press (OUP)},
+	title = {ntCard: a streaming algorithm for cardinality estimation in genomics data},
+	url = {http://dx.doi.org/10.1093/bioinformatics/btw832},
+	volume = 33,
+	year = 2017,
+	bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btw832}}