doc/theory/entropy/index.html


<!doctype html>
<html lang="en" class="no-js">
  <head>
    
      <meta charset="utf-8">
      <meta name="viewport" content="width=device-width,initial-scale=1">
      
      
        <link rel="prev" href="../encoding/">
      
      
        <link rel="next" href="../minimizer/">
      
      
      <link rel="icon" href="../../assets/images/favicon.png">
      <meta name="generator" content="mkdocs-1.6.1, mkdocs-material-9.7.6">
    
    
        <title>Entropy filter - obikmer</title>
      
    
      <link rel="stylesheet" href="../../assets/stylesheets/main.484c7ddc.min.css">
      
      
        <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
        <link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,300i,400,400i,700,700i%7CRoboto+Mono:400,400i,700,700i&display=fallback">
        <style>:root{--md-text-font:"Roboto";--md-code-font:"Roboto Mono"}</style>
      
    
    <script>
      // Bootstrap helpers used by the inline scripts further down this page.
      // They are assigned without a declaration keyword, so each one becomes
      // a plain global.

      // URL two levels above the current location; its pathname prefixes
      // every storage key built below.
      __md_scope = new URL("../..", location);

      // Shift-and-subtract hash folded over the characters of `text`,
      // starting from 0.
      __md_hash = text => {
        let acc = 0;
        for (const ch of [...text]) {
          acc = (acc << 5) - acc + ch.charCodeAt(0);
        }
        return acc;
      };

      // Read the value stored under "<scope pathname>.<key>" and JSON-decode it.
      __md_get = (key, storage = localStorage, scope = __md_scope) => {
        const raw = storage.getItem(scope.pathname + "." + key);
        return JSON.parse(raw);
      };

      // JSON-encode `value` and store it under "<scope pathname>.<key>".
      // Any exception raised while encoding or writing is deliberately
      // swallowed.
      __md_set = (key, value, storage = localStorage, scope = __md_scope) => {
        try {
          storage.setItem(scope.pathname + "." + key, JSON.stringify(value));
        } catch (err) {
          // Intentionally ignored.
        }
      };
    </script>
    
      
  </head>
  
  
    <body dir="ltr">
  
    
    <input class="md-toggle" data-md-toggle="drawer" type="checkbox" id="__drawer" autocomplete="off">
    <input class="md-toggle" data-md-toggle="search" type="checkbox" id="__search" autocomplete="off">
    <label class="md-overlay" for="__drawer"></label>
    <div data-md-component="skip">
      
        
        <a href="#kmer-entropy-filter" class="md-skip">
          Skip to content
        </a>
      
    </div>
    <div data-md-component="announce">
      
    </div>
    
    
<header class="md-header md-header--shadow" data-md-component="header">
  <nav class="md-header__inner md-grid" aria-label="Header">
    <a href="../.." title="obikmer" class="md-header__button md-logo" aria-label="obikmer" data-md-component="logo">
      
  
  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>

    </a>
    <label class="md-header__button md-icon" for="__drawer">
      
      <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z"/></svg>
    </label>
    <div class="md-header__title" data-md-component="header-title">
      <div class="md-header__ellipsis">
        <div class="md-header__topic">
          <span class="md-ellipsis">
            obikmer
          </span>
        </div>
        <div class="md-header__topic" data-md-component="header-topic">
          <span class="md-ellipsis">
            
              Entropy filter
            
          </span>
        </div>
      </div>
    </div>
    
    
      <script>
        // Re-apply the colour palette saved under "__palette" by copying each
        // entry of palette.color onto <body> as a data-md-color-* attribute.
        //
        // When the saved media query is the generic "(prefers-color-scheme)",
        // it is first resolved to the light or dark palette input that matches
        // the current system preference, and that input's attributes replace
        // the saved values.
        //
        // `var` is kept on purpose so the same globals exist as before.
        var palette = __md_get("__palette");
        if (palette && palette.color) {
          if ("(prefers-color-scheme)" === palette.color.media) {
            var media = matchMedia("(prefers-color-scheme: light)"),
                input = document.querySelector(
                  media.matches
                    ? "[data-md-color-media='(prefers-color-scheme: light)']"
                    : "[data-md-color-media='(prefers-color-scheme: dark)']"
                );
            // querySelector returns null when the page has no matching palette
            // input; dereferencing it unguarded raised a TypeError.
            if (input) {
              palette.color.media = input.getAttribute("data-md-color-media");
              palette.color.scheme = input.getAttribute("data-md-color-scheme");
              palette.color.primary = input.getAttribute("data-md-color-primary");
              palette.color.accent = input.getAttribute("data-md-color-accent");
            }
          }
          // Apply only a concrete palette. If the generic media query could not
          // be resolved above, leave <body> untouched, as the aborted script
          // did before the guard was added.
          if ("(prefers-color-scheme)" !== palette.color.media) {
            for (var [key, value] of Object.entries(palette.color)) {
              document.body.setAttribute("data-md-color-" + key, value);
            }
          }
        }
      </script>
    
    
  </nav>
  
</header>
    
    <div class="md-container" data-md-component="container">
      
      
      <main class="md-main" data-md-component="main">
        <div class="md-main__inner md-grid">
          
            
              <div class="md-sidebar md-sidebar--primary" data-md-component="sidebar" data-md-type="navigation" >
                <div class="md-sidebar__scrollwrap">
                  <div class="md-sidebar__inner">
                    

<nav class="md-nav md-nav--primary" aria-label="Navigation" data-md-level="0">
  <label class="md-nav__title" for="__drawer">
    <a href="../.." title="obikmer" class="md-nav__button md-logo" aria-label="obikmer" data-md-component="logo">
      
  
  <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M12 8a3 3 0 0 0 3-3 3 3 0 0 0-3-3 3 3 0 0 0-3 3 3 3 0 0 0 3 3m0 3.54C9.64 9.35 6.5 8 3 8v11c3.5 0 6.64 1.35 9 3.54 2.36-2.19 5.5-3.54 9-3.54V8c-3.5 0-6.64 1.35-9 3.54"/></svg>

    </a>
    obikmer
  </label>
  
  <ul class="md-nav__list" data-md-scrollfix>
    
      
    <li class="md-nav__item">
      <a href="../.." class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Home
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item md-nav__item--active md-nav__item--nested">
      
        
        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_2" checked>
        
          
          <label class="md-nav__link" for="__nav_2" id="__nav_2_label" tabindex="0">
            
  
  <span class="md-ellipsis">
    
  
    Theory
  

  </span>
  
  
            <span class="md-nav__icon md-icon"></span>
          </label>
        
        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_2_label" aria-expanded="true">
          <label class="md-nav__title" for="__nav_2">
            <span class="md-nav__icon md-icon"></span>
            
  
    Theory
  

          </label>
          <ul class="md-nav__list" data-md-scrollfix>
            
              
    <li class="md-nav__item">
      <a href="../../kmers/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Kmers and super-kmers
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../encoding/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    DNA encoding
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item md-nav__item--active">
      
      <input class="md-nav__toggle md-toggle" type="checkbox" id="__toc">
      
      
        <label class="md-nav__link md-nav__link--active" for="__toc">
          
  
  <span class="md-ellipsis">
    
  
    Entropy filter
  

  </span>
  
  
          <span class="md-nav__icon md-icon"></span>
        </label>
      
      <a href="./" class="md-nav__link md-nav__link--active">
        
  
  <span class="md-ellipsis">
    
  
    Entropy filter
  

  </span>
  
  
      </a>
      
        
<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
  
  
    <label class="md-nav__title" for="__toc">
      <span class="md-nav__icon md-icon"></span>
      Table of contents
    </label>
    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
      
        <li class="md-nav__item">
  <a href="#sub-word-frequencies" class="md-nav__link">
    <span class="md-ellipsis">
      
        Sub-word frequencies
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#corrected-shannon-entropy" class="md-nav__link">
    <span class="md-ellipsis">
      
        Corrected Shannon entropy
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#maximum-entropy-correction-for-small-samples" class="md-nav__link">
    <span class="md-ellipsis">
      
        Maximum entropy correction for small samples
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#normalized-entropy" class="md-nav__link">
    <span class="md-ellipsis">
      
        Normalized entropy
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#final-score" class="md-nav__link">
    <span class="md-ellipsis">
      
        Final score
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#interpretation-as-an-effective-number-of-classes" class="md-nav__link">
    <span class="md-ellipsis">
      
        Interpretation as an effective number of classes
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#properties" class="md-nav__link">
    <span class="md-ellipsis">
      
        Properties
      
    </span>
  </a>
  
</li>
      
    </ul>
  
</nav>
      
    </li>
  

    <li class="md-nav__item">
      <a href="../minimizer/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Minimizer selection
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../indexing/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Partitioning architecture
  

  </span>
  
  
      </a>
    </li>
  

          </ul>
        </nav>
      
    </li>
  

    <li class="md-nav__item md-nav__item--nested">
      
        
        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_3" >
        
          
          <label class="md-nav__link" for="__nav_3" id="__nav_3_label" tabindex="0">
            
  
  <span class="md-ellipsis">
    
  
    Implementation
  

  </span>
  
  
            <span class="md-nav__icon md-icon"></span>
          </label>
        
        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_3_label" aria-expanded="false">
          <label class="md-nav__title" for="__nav_3">
            <span class="md-nav__icon md-icon"></span>
            
  
    Implementation
  

          </label>
          <ul class="md-nav__list" data-md-scrollfix>
            
              
    <li class="md-nav__item">
      <a href="../../implementation/superkmer/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    SuperKmer
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/kmer/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Kmer
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/chunkreader/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Chunk reader
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/pipeline/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Construction pipeline
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/obipipeline/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    obipipeline library
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/storage/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    On-disk storage
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/mphf/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    MPHF selection
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/unitig_evidence/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Unitig evidence encoding
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/obilayeredmap/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    obilayeredmap crate
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/persistent_compact_int_vec/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    PersistentCompactIntVec
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../implementation/persistent_bit_vec/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    PersistentBitVec
  

  </span>
  
  
      </a>
    </li>
  

          </ul>
        </nav>
      
    </li>
  

    <li class="md-nav__item md-nav__item--nested">
      
        
        <input class="md-nav__toggle md-toggle " type="checkbox" id="__nav_4" >
        
          
          <label class="md-nav__link" for="__nav_4" id="__nav_4_label" tabindex="0">
            
  
  <span class="md-ellipsis">
    
  
    Architecture
  

  </span>
  
  
            <span class="md-nav__icon md-icon"></span>
          </label>
        
        <nav class="md-nav" data-md-level="1" aria-labelledby="__nav_4_label" aria-expanded="false">
          <label class="md-nav__title" for="__nav_4">
            <span class="md-nav__icon md-icon"></span>
            
  
    Architecture
  

          </label>
          <ul class="md-nav__list" data-md-scrollfix>
            
              
    <li class="md-nav__item">
      <a href="../../architecture/sequences/invariant/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Sequences
  

  </span>
  
  
      </a>
    </li>
  

    <li class="md-nav__item">
      <a href="../../architecture/index_architecture/" class="md-nav__link">
        
  
  <span class="md-ellipsis">
    
  
    Kmer index
  

  </span>
  
  
      </a>
    </li>
  

          </ul>
        </nav>
      
    </li>
  

  </ul>
</nav>
                  </div>
                </div>
              </div>
            
            
              <div class="md-sidebar md-sidebar--secondary" data-md-component="sidebar" data-md-type="toc" >
                <div class="md-sidebar__scrollwrap">
                  <div class="md-sidebar__inner">
                    

<nav class="md-nav md-nav--secondary" aria-label="Table of contents">
  
  
    <label class="md-nav__title" for="__toc">
      <span class="md-nav__icon md-icon"></span>
      Table of contents
    </label>
    <ul class="md-nav__list" data-md-component="toc" data-md-scrollfix>
      
        <li class="md-nav__item">
  <a href="#sub-word-frequencies" class="md-nav__link">
    <span class="md-ellipsis">
      
        Sub-word frequencies
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#corrected-shannon-entropy" class="md-nav__link">
    <span class="md-ellipsis">
      
        Corrected Shannon entropy
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#maximum-entropy-correction-for-small-samples" class="md-nav__link">
    <span class="md-ellipsis">
      
        Maximum entropy correction for small samples
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#normalized-entropy" class="md-nav__link">
    <span class="md-ellipsis">
      
        Normalized entropy
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#final-score" class="md-nav__link">
    <span class="md-ellipsis">
      
        Final score
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#interpretation-as-an-effective-number-of-classes" class="md-nav__link">
    <span class="md-ellipsis">
      
        Interpretation as an effective number of classes
      
    </span>
  </a>
  
</li>
      
        <li class="md-nav__item">
  <a href="#properties" class="md-nav__link">
    <span class="md-ellipsis">
      
        Properties
      
    </span>
  </a>
  
</li>
      
    </ul>
  
</nav>
                  </div>
                </div>
              </div>
            
          
            <div class="md-content" data-md-component="content">
              
              <article class="md-content__inner md-typeset">
                
                  
<h1 id="kmer-entropy-filter">Kmer entropy filter</h1>
<p>Low-complexity kmers (polyA, polyT, tandem repeats) are detected and excluded during phase 1. The filter computes a <strong>normalized Shannon entropy</strong> over sub-words of multiple sizes, corrected for two sources of bias: the small number of observations within a single kmer, and the unequal sizes of circular equivalence classes.</p>
<h2 id="sub-word-frequencies">Sub-word frequencies</h2>
<p>For a kmer of length k and a sub-word size ws (1 ≤ ws ≤ ws_max, typically ws_max = 6), extract the <span class="arithmatex">\(n_{\text{words}} = k - ws + 1\)</span> overlapping sub-words by sliding a window of length ws:</p>
<div class="arithmatex">\[w_i = \text{kmer}[i \mathinner{..} i+ws-1], \quad i = 0, \ldots, n_{\text{words}}-1\]</div>
<p>Each sub-word is mapped to its <strong>circular canonical form</strong>: the lexicographic minimum among all cyclic rotations of the word <strong>and all cyclic rotations of its reverse complement</strong>. This extended equivalence relation ensures that entropy(K) = entropy(revcomp(K)) — the filter is strand-symmetric. Let <span class="arithmatex">\(s_j\)</span> be the size of equivalence class <span class="arithmatex">\(j\)</span> (number of distinct raw words mapping to canonical form <span class="arithmatex">\(j\)</span>), and <span class="arithmatex">\(f_j\)</span> the count of canonical form <span class="arithmatex">\(j\)</span> among the <span class="arithmatex">\(n_{\text{words}}\)</span> sub-words (<span class="arithmatex">\(\sum_j f_j = n_{\text{words}}\)</span>).</p>
<h2 id="corrected-shannon-entropy">Corrected Shannon entropy</h2>
<p>The circular equivalence classes have unequal sizes: under a uniform distribution over all <span class="arithmatex">\(4^{ws}\)</span> raw words, class <span class="arithmatex">\(j\)</span> is visited with probability <span class="arithmatex">\(s_j / 4^{ws}\)</span>, not <span class="arithmatex">\(1/n_a\)</span>, where <span class="arithmatex">\(n_a\)</span> is the number of canonical classes. Computing entropy directly over canonical classes therefore underestimates the entropy of a random sequence.</p>
<p>The correction "unfolds" each canonical class back to its member raw words, redistributing each observation of class <span class="arithmatex">\(j\)</span> equally among its <span class="arithmatex">\(s_j\)</span> members:</p>
<div class="arithmatex">\[H_{\text{corr}} = \log(n_{\text{words}}) - \frac{1}{n_{\text{words}}} \sum_j f_j \log f_j + \frac{1}{n_{\text{words}}} \sum_j f_j \log s_j\]</div>
<p>The last term is the correction for unequal class sizes. For a uniformly random sequence (<span class="arithmatex">\(f_j \approx n_{\text{words}} \cdot s_j / 4^{ws}\)</span>), this gives <span class="arithmatex">\(H_{\text{corr}} \approx \log(4^{ws}) = 2 \cdot ws \cdot \log 2\)</span>, the maximum entropy over raw words.</p>
<h2 id="maximum-entropy-correction-for-small-samples">Maximum entropy correction for small samples</h2>
<p>With only <span class="arithmatex">\(n_{\text{words}}\)</span> observations over <span class="arithmatex">\(4^{ws}\)</span> possible raw words, the achievable maximum entropy is bounded by the entropy of the most uniform integer distribution of those observations over <span class="arithmatex">\(4^{ws}\)</span> categories.</p>
<p>Let <span class="arithmatex">\(c = \lfloor n_{\text{words}} / 4^{ws} \rfloor\)</span> and <span class="arithmatex">\(r = n_{\text{words}} \bmod 4^{ws}\)</span>. The most uniform integer distribution assigns frequency <span class="arithmatex">\(c+1\)</span> to <span class="arithmatex">\(r\)</span> categories and <span class="arithmatex">\(c\)</span> to the remaining <span class="arithmatex">\(4^{ws} - r\)</span>, with the convention <span class="arithmatex">\(0 \log 0 = 0\)</span>:</p>
<div class="arithmatex">\[H_{\max} = -\left[(4^{ws} - r)\,\frac{c}{n_{\text{words}}}\log\frac{c}{n_{\text{words}}} + r\,\frac{c+1}{n_{\text{words}}}\log\frac{c+1}{n_{\text{words}}}\right]\]</div>
<p>When <span class="arithmatex">\(n_{\text{words}} &lt; 4^{ws}\)</span>: <span class="arithmatex">\(c=0\)</span>, <span class="arithmatex">\(r=n_{\text{words}}\)</span>, and the formula reduces to <span class="arithmatex">\(H_{\max} = \log(n_{\text{words}})\)</span> — a single unified expression covers both regimes. A truly random sequence achieves <span class="arithmatex">\(H_{\text{corr}} \approx H_{\max}\)</span>.</p>
<h2 id="normalized-entropy">Normalized entropy</h2>
<div class="arithmatex">\[\hat{H}(ws) = \frac{H_{\text{corr}}}{H_{\max}} \in [0, 1]\]</div>
<h2 id="final-score">Final score</h2>
<p>The filter computes <span class="arithmatex">\(\hat{H}(ws)\)</span> for each word size ws from 1 to ws_max and returns the <strong>minimum</strong>:</p>
<div class="arithmatex">\[\text{entropy}(kmer) = \min_{ws=1}^{ws_{\max}} \hat{H}(ws)\]</div>
<p>A value near 0 indicates low complexity (e.g. AAAA…); near 1 indicates high complexity. A kmer is rejected if <span class="arithmatex">\(\text{entropy}(kmer) \leq \theta\)</span>, where <span class="arithmatex">\(\theta\)</span> is a collection parameter. The minimum across word sizes ensures that any scale of repetition is detected independently: polyA is caught at ws=1, dinucleotide repeats at ws=2, etc.</p>
<h2 id="interpretation-as-an-effective-number-of-classes">Interpretation as an effective number of classes</h2>
<p><span class="arithmatex">\(H_{\text{corr}}\)</span> is a standard Shannon entropy over raw words (after unfolding the equivalence classes), so the classical perplexity interpretation holds directly: <span class="arithmatex">\(N_{\text{eff}} = e^{H_{\text{corr}}}\)</span> is the number of equiprobable classes that would yield the same entropy.</p>
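<p>For the normalized score <span class="arithmatex">\(\hat{H}\)</span>, dividing by <span class="arithmatex">\(H_{\text{max}}\)</span> changes the logarithm base. Writing <span class="arithmatex">\(N_{\text{max}} = e^{H_{\text{max}}}\)</span> for the effective number of classes at maximum entropy:</p>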
<div class="arithmatex">\[\hat{H} = \frac{\log N_{\text{eff}}}{\log N_{\text{max}}} = \log_{N_{\text{max}}} N_{\text{eff}} \quad \Longleftrightarrow \quad N_{\text{eff}} = N_{\text{max}}^{\,\hat{H}}\]</div>
<p>The property is preserved: <span class="arithmatex">\(\hat{H}\)</span> is the logarithm (in base <span class="arithmatex">\(N_{\text{max}}\)</span>) of the effective number of equi-represented classes.</p>
<p>In the large-sample limit (<span class="arithmatex">\(n_{\text{words}} \gg 4^{ws}\)</span>), <span class="arithmatex">\(N_{\text{max}} \approx 4^{ws}\)</span>, giving:</p>
<div class="arithmatex">\[N_{\text{eff}} \approx 4^{ws \cdot \hat{H}}\]</div>
<p>This has a clean interpretation: <span class="arithmatex">\(ws \cdot \hat{H}\)</span> is the <strong>effective word length</strong> (in bases) of a perfectly uniform distribution that would produce the same entropy. At <span class="arithmatex">\(\hat{H} = 1\)</span> the full space of <span class="arithmatex">\(4^{ws}\)</span> words is used; at <span class="arithmatex">\(\hat{H} = 0.5\)</span> with ws=2, only <span class="arithmatex">\(4^1 = 4\)</span> effective classes out of 16 are occupied.</p>
<p>In our actual regime, <span class="arithmatex">\(n_{\text{words}}\)</span> is small and <span class="arithmatex">\(4^{ws}\)</span> can exceed <span class="arithmatex">\(n_{\text{words}}\)</span>, so <span class="arithmatex">\(H_{\text{max}} &lt; \log(4^{ws})\)</span> due to the small-sample correction. The exact effective count is <span class="arithmatex">\(N_{\text{max}}^{\hat{H}}\)</span>, not <span class="arithmatex">\(4^{ws \cdot \hat{H}}\)</span>.</p>
<h2 id="properties">Properties</h2>
<p>The entropy score is a function of the kmer sequence alone — it does not depend on the surrounding context or on the position within any genome. Two consequences:</p>
<ul>
<li><strong>Orientation invariance</strong>: <span class="arithmatex">\(\text{entropy}(K) = \text{entropy}(\text{revcomp}(K))\)</span>, guaranteed by the strand-symmetric canonical form.</li>
<li><strong>Context independence</strong>: the same kmer is always rejected or always kept, regardless of which genome it occurs in, where in that genome it appears, or which strand is considered. The filter defines a fixed partition of the kmer space into low-complexity and valid kmers.</li>
</ul>


              </article>
            </div>
          
          
<script>
  // Look up the element whose id equals the URL fragment (without the "#").
  // If it exists and has a non-empty name, set its checked state to whether
  // that name starts with "__tabbed_".
  var target = document.getElementById(location.hash.slice(1));
  if (target && target.name) {
    target.checked = target.name.startsWith("__tabbed_");
  }
</script>
        </div>
        
      </main>
      
        <footer class="md-footer">
  
  <div class="md-footer-meta md-typeset">
    <div class="md-footer-meta__inner md-grid">
      <div class="md-copyright">
  
  
    Made with
    <a href="https://squidfunk.github.io/mkdocs-material/" target="_blank" rel="noopener">
      Material for MkDocs
    </a>
  
</div>
      
    </div>
  </div>
</footer>
      
    </div>
    <div class="md-dialog" data-md-component="dialog">
      <div class="md-dialog__inner md-typeset"></div>
    </div>
    
    
      <script id="__config" type="application/json">{"annotate": null, "base": "../..", "features": [], "search": "../../assets/javascripts/workers/search.2c215733.min.js", "tags": null, "translations": {"clipboard.copied": "Copied to clipboard", "clipboard.copy": "Copy to clipboard", "search.result.more.one": "1 more on this page", "search.result.more.other": "# more on this page", "search.result.none": "No matching documents", "search.result.one": "1 matching document", "search.result.other": "# matching documents", "search.result.placeholder": "Type to start searching", "search.result.term.missing": "Missing", "select.version": "Select version"}, "version": null}</script>
    
    
      <script src="../../assets/javascripts/bundle.79ae519e.min.js"></script>
      
        <script src="https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js"></script>
      
    
  </body>
</html>