OBIJupyterHub/jupyterhub_volumes/course/data/Wolf_diet/wolf.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d74a3346-9e41-42db-8134-6cc3e697dafc",
   "metadata": {},
   "source": [
    "# Wolf diet analysis\n",
    "\n",
    "## Preparing the reference database\n",
    "\n",
    "We'll use a reduced version of *GenBank* containing only mammal sequences shorter than 20 kb.\n",
    "\n",
    "```bash\n",
    "obigrep -L 20000 -Z \\\n",
    "        ../course/data/Genbank/Release_264 \\\n",
    "        > gb264_small_mam.fasta.gz\n",
    "```\n",
    "\n",
    "On that small database, we'll run an in silico PCR with `obipcr`.\n",
    "\n",
    "Vertebrate primers:\n",
    "\n",
    "- forward: TTAGATACCCCACTATGC\n",
    "- reverse: TAGAACAGGCTCCTCTAG\n",
    "\n",
    "We'll allow at most 4 mismatches on each primer (`-e 4`) and keep only barcodes of at most 200 bp (`-L 200`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c02aea46-81db-4131-a1c5-7cc4ede91636",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 32                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] ../course/data/Genbank/Release_264/small.fasta.gz mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n"
     ]
    }
   ],
   "source": [
    "obipcr --forward TTAGATACCCCACTATGC \\\n",
    "       --reverse TAGAACAGGCTCCTCTAG \\\n",
    "       -L 200 -e 4 -Z \\\n",
    "       --no-progressbar \\\n",
    "       ../course/data/Genbank/Release_264/small.fasta.gz \\\n",
    "       > vert01_raw_db.fasta.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3749e102-9e4a-4d33-8ec3-34fb08db6bca",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] vert01_raw_db.fasta.gz mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Running dereplication on disk with 100 chunks \n",
      "\u001b[36mINFO\u001b[0m[0000] Keep sigletons in the output                 \n",
      "\u001b[36mINFO\u001b[0m[0000] Starting data splitting                      \n",
      "                                                - Reading sequences (20678/-, 72999 it/s) [0s] - Splitting data set (22774/-, 78503 it/s) [0s] \n",
      "\n",
      "\u001b[36mINFO\u001b[0m[0000] Data splitted over 100 batches               \n",
      "\u001b[36mINFO\u001b[0m[0000] End of the data splitting                    \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_0.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 0/100 : 108 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_1.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 1/100 : 105 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_10.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 2/100 : 83 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_11.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 3/100 : 100 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_12.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 4/100 : 81 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_13.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 5/100 : 322 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_14.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 6/100 : 1022 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_15.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 7/100 : 156 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_16.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 8/100 : 304 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_17.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 9/100 : 368 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_18.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 10/100 : 148 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_19.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 11/100 : 526 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_2.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 12/100 : 185 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_20.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 13/100 : 110 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_21.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 14/100 : 115 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_22.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 15/100 : 305 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_23.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 16/100 : 61 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_24.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 17/100 : 215 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_25.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 18/100 : 228 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_26.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 19/100 : 112 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_27.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 20/100 : 642 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_28.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 21/100 : 167 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_29.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 22/100 : 573 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_3.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 23/100 : 60 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_30.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 24/100 : 592 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_31.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 25/100 : 215 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_32.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 26/100 : 183 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_33.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 27/100 : 93 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_34.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 28/100 : 129 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_35.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 29/100 : 173 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_36.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 30/100 : 160 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_37.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 31/100 : 127 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_38.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 32/100 : 108 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_39.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 33/100 : 200 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_4.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 34/100 : 112 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_40.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 35/100 : 167 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_41.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 36/100 : 249 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_42.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 37/100 : 138 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_43.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 38/100 : 235 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_44.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 39/100 : 135 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_45.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 40/100 : 451 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_46.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 41/100 : 87 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_47.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 42/100 : 177 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_48.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 43/100 : 166 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_49.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 44/100 : 75 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_5.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 45/100 : 329 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_50.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 46/100 : 811 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_51.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 47/100 : 298 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_52.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 48/100 : 128 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_53.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 49/100 : 70 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_54.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 50/100 : 47 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_55.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 51/100 : 241 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_56.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 52/100 : 167 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_57.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 53/100 : 136 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_58.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 54/100 : 261 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_59.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 55/100 : 265 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_6.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 56/100 : 146 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_60.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 57/100 : 244 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_61.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 58/100 : 153 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_62.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 59/100 : 220 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_63.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 60/100 : 299 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_64.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 61/100 : 850 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_65.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 62/100 : 147 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_66.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 63/100 : 132 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_67.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 64/100 : 216 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_68.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 65/100 : 105 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_69.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 66/100 : 857 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_7.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 67/100 : 87 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_70.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 68/100 : 180 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_71.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 69/100 : 181 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_72.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 70/100 : 94 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_73.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 71/100 : 98 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_74.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 72/100 : 49 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_75.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 73/100 : 264 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_76.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 74/100 : 367 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_77.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 75/100 : 133 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_78.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 76/100 : 88 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_79.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 77/100 : 1445 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_8.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 78/100 : 1718 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_80.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 79/100 : 128 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_81.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 80/100 : 146 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_82.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 81/100 : 108 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_83.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 82/100 : 100 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_84.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 83/100 : 254 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_85.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 84/100 : 119 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_86.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 85/100 : 370 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_87.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 86/100 : 279 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_88.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 87/100 : 202 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_89.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 88/100 : 224 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_9.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 89/100 : 405 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_90.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 90/100 : 190 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_91.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 91/100 : 104 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_92.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 92/100 : 96 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_93.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 93/100 : 58 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_94.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 94/100 : 79 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_95.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 95/100 : 222 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_96.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Start processing of batch 96/100 : 4158 sequences \n",
      "\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_97.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 97/100 : 129 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_488015107/chunk_98.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 98/100 : 183 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_488015107/chunk_99.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 99/100 : 348 sequences \n"
     ]
    }
   ],
   "source": [
    "obiuniq -m taxid -Z \\\n",
    "        vert01_raw_db.fasta.gz \\\n",
    "        > vert01_uniq_db.fasta.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "826d4d84-f446-417d-a67f-79ee222194e7",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0001] NCBI Taxdump Tar Archive detected: ../course/data/ncbitaxo_20251118.tgz \n",
      "\u001b[36mINFO\u001b[0m[0001] Loading Taxonomy nodes                       \n",
      "\u001b[36mINFO\u001b[0m[0001] NCBI Taxdump Tar Archive detected: ../course/data/ncbitaxo_20251118.tgz \n",
      "\u001b[36mINFO\u001b[0m[0001] Loading Taxonomy nodes                       \n",
      "\u001b[36mINFO\u001b[0m[0016] 2706727 Taxonomy nodes read                  \n",
      "\u001b[36mINFO\u001b[0m[0016] Loading Taxon names                          \n",
      "\u001b[36mINFO\u001b[0m[0016] 2706727 Taxonomy nodes read                  \n",
      "\u001b[36mINFO\u001b[0m[0016] Loading Taxon names                          \n",
      "\u001b[36mINFO\u001b[0m[0038] 2706727 taxon names read                     \n",
      "\u001b[36mINFO\u001b[0m[0038] Loading Merged taxa                          \n",
      "\u001b[36mINFO\u001b[0m[0038] 2706727 taxon names read                     \n",
      "\u001b[36mINFO\u001b[0m[0038] Loading Merged taxa                          \n",
      "\u001b[36mINFO\u001b[0m[0038] 93509 merged taxa read                       \n",
      "\u001b[36mINFO\u001b[0m[0038] Set as default taxonomy NCBI Taxonomy        \n",
      "\u001b[36mINFO\u001b[0m[0038] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0038] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0038] vert01_uniq_db.fasta.gz mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0038] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0038] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0038] Data is writen to stdout                     \n",
      "\u001b[33mWARN\u001b[0m[0038] GU981038_sub[458..556]: Taxid 150090 has to be updated to taxon:3371150 [Pseudosoriculus fumidus]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] KY410960_sub[517..615]: Taxid 1620840 has to be updated to taxon:2740394 [Sundasciurus altitudinis]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] KX381710_sub[181..280]: Taxid 263451 has to be updated to taxon:3370180 [Lophostoma silvicola]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] JF694095_sub[447..545]: Taxid 164531 has to be updated to taxon:3370248 [Microperoryctes ornatus]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] AY495457_sub[449..548]: Taxid 258864 has to be updated to taxon:3370479 [Ozimops planiceps]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] KY464180_sub[516..615]: Taxid 2021987 has to be updated to taxon:3370147 [Lenothrix cana]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] KY581660_sub[517..616]: Taxid 478698 has to be updated to taxon:3370289 [Mops plicatus]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] AY495540_sub[446..543]: Taxid 258960 has to be updated to taxon:2720888 [Vespadelus darlingtoni]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] AJ430552_sub[441..540]: Taxid 410298 has to be updated to taxon:3370000 [Gerbilliscus afer]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] MF038679_sub[446..543]: Taxid 2093327 has to be updated to taxon:3369970 [Eptesicus dutertreus]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] JX520511_sub[433..531]: Taxid 35665 has to be updated to taxon:3370058 [Heterogeomys hispidus]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] MN807618_sub[513..611]: Taxid 119565 has to be updated to taxon:3371118 [Montemys delectorum]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] MK211397_sub[445..543]: Taxid 114227 has to be updated to taxon:3370452 [Onychogalea frenata]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] MF038680_sub[440..537]: Taxid 258906 has to be updated to taxon:3371012 [Cnephaeus hottentotus]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] MZ708834_sub[527..626]: Taxid 224955 has to be updated to taxon:3370766 [Scaptonyx fusicauda]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] AJ389548_sub[455..553]: Taxid 35664 has to be updated to taxon:3370057 [Heterogeomys heterodus]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] KX591624_sub[446..543]: Taxid 1898424 has to be updated to taxon:2778565 [Laephotis stanleyi]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] AF203773_sub[449..548]: Taxid 110940 has to be updated to taxon:3371137 [Paremballonura atrata]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] OR257582_sub[644..744]: Taxid 3055736 has to be updated to taxon:3370154 [Leopardus pardinoides]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] JQ039218_sub[446..543]: Taxid 568927 has to be updated to taxon:3369693 [Afronycteris nanus]@species \n",
      "\u001b[33mWARN\u001b[0m[0038] OM334923_sub[449..547]: Taxid 209424 has to be updated to taxon:3370523 [Petaurista grandis]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] JF694114_sub[450..548]: Taxid 1031278 has to be updated to taxon:3370506 [Perameles notina]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] KX381233_sub[178..277]: Taxid 40227 has to be updated to taxon:3369931 [Dermanura gnomus]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] KX381261_sub[181..280]: Taxid 263451 has to be updated to taxon:3370180 [Lophostoma silvicola]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] KX754507_sub[450..549]: Taxid 224955 has to be updated to taxon:3370766 [Scaptonyx fusicauda]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] JN255346_sub[448..547]: Taxid 190653 has to be updated to taxon:3369760 [Cercartetus nana]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] AY530083_sub[459..558]: Taxid 264759 has to be updated to taxon:3370991 [Atopogale cubana]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] OP328298_sub[837..934]: Taxid 169057 has to be updated to taxon:3371009 [Cnephaeus bottae]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] HM561626_sub[498..595]: Taxid 59452 has to be updated to taxon:3371021 [Cnephaeus serotinus]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] KP202255_sub[1281..1381]: Taxid 61412 has to be updated to taxon:3369756 [Caracal auratus]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] AF263234_sub[447..543]: Taxid 124746 has to be updated to taxon:3371123 [Neoeptesicus furinalis]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] KR537889_sub[522..620]: Taxid 61085 has to be updated to taxon:3369802 [Crocidura beata]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] AY495466_sub[452..549]: Taxid 258906 has to be updated to taxon:3371012 [Cnephaeus hottentotus]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] AY495514_sub[446..543]: Taxid 177184 has to be updated to taxon:2778567 [Pseudoromicia brunnea]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] KX754505_sub[452..551]: Taxid 224955 has to be updated to taxon:3370766 [Scaptonyx fusicauda]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] MH801935_sub[521..620]: Taxid 2588392 has to be updated to taxon:3370481 [Parablarinella griselda]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] AF442092_sub[447..546]: Taxid 263451 has to be updated to taxon:3370180 [Lophostoma silvicola]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] HM106321_sub[518..617]: Taxid 55048 has to be updated to taxon:3370370 [Neogale frenata]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] AY495467_sub[447..544]: Taxid 59452 has to be updated to taxon:3371021 [Cnephaeus serotinus]@species \n",
      "\u001b[33mWARN\u001b[0m[0039] KX754504_sub[450..549]: Taxid 224955 has to be updated to taxon:3370766 [Scaptonyx fusicauda]@species \n",
      "\n",
      "\u001b[36mINFO\u001b[0m[0039] 93509 merged taxa read                       \n",
      "\u001b[36mINFO\u001b[0m[0039] Set as default taxonomy NCBI Taxonomy        \n",
      "\u001b[36mINFO\u001b[0m[0039] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0039] Reading sequences from stdin in guessed      \n",
      "\u001b[36mINFO\u001b[0m[0039] - mime type: text/fasta                      \n",
      "\u001b[36mINFO\u001b[0m[0039] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0039] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0039] Data is writen to stdout                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "obiannotate --add-lca-in taxid \\\n",
    "            -t ../course/data/ncbitaxo_20251118.tgz \\\n",
    "            vert01_uniq_db.fasta.gz \\\n",
    "  | obiannotate -t ../course/data/ncbitaxo_20251118.tgz \\\n",
    "            --taxonomic-rank -Z \\\n",
    "            > vert01_lca_db.fasta.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "0e7d06ec-6477-4fdd-8213-295d5f1c21ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] vert01_lca_db.fasta.gz mime type: text/fasta \n",
      "\n",
      "\n",
      "      4 clade\n",
      "     52 family\n",
      "    374 genus\n",
      "      6 no rank\n",
      "      1 order\n",
      "   3180 species\n",
      "      1 species group\n",
      "     46 subfamily\n",
      "      6 subgenus\n",
      "      1 suborder\n",
      "    180 subspecies\n",
      "      6 tribe\n"
     ]
    }
   ],
   "source": [
    "obicsv -k taxonomic_rank vert01_lca_db.fasta.gz \\\n",
    "  | tail -n +2 \\\n",
    "  | sort \\\n",
    "  | uniq -c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "2b5a78c0-f1f8-4e6d-80f0-076cac6a19b5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] NCBI Taxdump Tar Archive detected: ../course/data/ncbitaxo_20251118.tgz \n",
      "\u001b[36mINFO\u001b[0m[0000] Loading Taxonomy nodes                       \n",
      "\u001b[36mINFO\u001b[0m[0007] 2706727 Taxonomy nodes read                  \n",
      "\u001b[36mINFO\u001b[0m[0007] Loading Taxon names                          \n",
      "\u001b[36mINFO\u001b[0m[0016] 2706727 taxon names read                     \n",
      "\u001b[36mINFO\u001b[0m[0016] Loading Merged taxa                          \n",
      "\u001b[36mINFO\u001b[0m[0016] 93509 merged taxa read                       \n",
      "\u001b[36mINFO\u001b[0m[0016] Set as default taxonomy NCBI Taxonomy        \n",
      "\u001b[36mINFO\u001b[0m[0016] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0016] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0016] vert01_lca_db.fasta.gz mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0016] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0016] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0016] Data is writen to stdout                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Keep only the sequences whose taxon has a family-level rank in its lineage\n",
    "# (LCAs resolved only above family, e.g. order or clade, are discarded).\n",
    "# -Z gzips the output so that the file content matches its .gz name.\n",
    "obigrep --require-rank family \\\n",
    "        -t ../course/data/ncbitaxo_20251118.tgz \\\n",
    "        --update-taxid \\\n",
    "        -Z \\\n",
    "        vert01_lca_db.fasta.gz \\\n",
    "        > vert01_clean_db.fasta.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "713eea58-4ce9-4444-a945-61d3e24aab63",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] vert01_clean_db.fasta.gz mime type: text/fasta \n",
      "\n",
      "\n",
      "     52 family\n",
      "    374 genus\n",
      "      6 no rank\n",
      "   3180 species\n",
      "      1 species group\n",
      "     46 subfamily\n",
      "      6 subgenus\n",
      "    180 subspecies\n",
      "      6 tribe\n"
     ]
    }
   ],
   "source": [
    "obicsv -k taxonomic_rank vert01_clean_db.fasta.gz \\\n",
    "  | tail -n +2 \\\n",
    "  | sort \\\n",
    "  | uniq -c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "374322c4-8ce0-46a2-b5af-aa6900ee8735",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] vert01_clean_db.fasta.gz mime type: text/fasta \n",
      "\n",
      "\n",
      "     21 family\n",
      "    188 genus\n",
      "      3 no rank\n",
      "   2021 species\n",
      "      1 species group\n",
      "     23 subfamily\n",
      "      3 subgenus\n",
      "     98 subspecies\n",
      "      2 tribe\n"
     ]
    }
   ],
   "source": [
    "obicsv -k taxid vert01_clean_db.fasta.gz \\\n",
    "  | tail -n +2 \\\n",
    "  | sort \\\n",
    "  | uniq \\\n",
    "  | awk -F'@' '{print $2}' \\\n",
    "  | sort \\\n",
    "  | uniq -c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "4e1cd2a1-e0bc-4301-8b60-69bba99ff384",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] NCBI Taxdump Tar Archive detected: ../course/data/ncbitaxo_20251118.tgz \n",
      "\u001b[36mINFO\u001b[0m[0000] Loading Taxonomy nodes                       \n",
      "\u001b[36mINFO\u001b[0m[0007] 2706727 Taxonomy nodes read                  \n",
      "\u001b[36mINFO\u001b[0m[0007] Loading Taxon names                          \n",
      "\u001b[36mINFO\u001b[0m[0017] 2706727 taxon names read                     \n",
      "\u001b[36mINFO\u001b[0m[0017] Loading Merged taxa                          \n",
      "\u001b[36mINFO\u001b[0m[0017] 93509 merged taxa read                       \n",
      "\u001b[36mINFO\u001b[0m[0017] Set as default taxonomy NCBI Taxonomy        \n",
      "\u001b[36mINFO\u001b[0m[0017] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0017] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0017] vert01_clean_db.fasta.gz mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0017] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0017] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0017] Data is writen to stdout                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "obiannotate --taxonomic-path \\\n",
    "            -t ../course/data/ncbitaxo_20251118.tgz \\\n",
    "            -Z \\\n",
    "            vert01_clean_db.fasta.gz \\\n",
    "            > vert01_ref_db.fasta.gz "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "abf03b79-7a61-4d70-a212-7f7ee7e2319e",
   "metadata": {},
   "source": [
    "## Analyzing the metabarcoding data\n",
    "\n",
    "### Step 1: Pairing the Reads\n",
    "\n",
    "We'll use the `obipairing` *OBITools4* command."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "ee953292-a9ed-4a2e-bec3-2482dbf0f120",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] ../course/data/Wolf_diet/wolf_F.fastq mime type: text/fastq \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] ../course/data/Wolf_diet/wolf_R.fastq mime type: text/fastq \n",
      "\u001b[36mINFO\u001b[0m[0000] Start of the sequence Pairing using 16 workers \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Initializing the DNA Scoring matrix          \n",
      "                                                - Reading sequences (15525/-, 59229 it/s) [0s] | Reading sequences (19407/-, 43953 it/s) [0s] | Reading sequences (27170/-, 60156 it/s) [0s] / Reading sequences (23288/-, 42042 it/s) [0s] - Reading sequences (31052/-, 47481 it/s) [0s] | Reading sequences (38815/-, 47481 it/s) [0s] \\ Reading sequences (31052/-, 27527 it/s) [1s] - Reading sequences (38815/-, 27527 it/s) [1s] - Reading sequences (41393/-, 30190 it/s) [1s] \\ Reading sequences (41393/-, 27527 it/s) [1s] \n",
      "                                                | Reading sequences (45276/-, 30190 it/s) [1s] \n",
      "\u001b[36mINFO\u001b[0m[0003] End of the sequence Pairing                  \n"
     ]
    }
   ],
   "source": [
    "obipairing -F ../course/data/Wolf_diet/wolf_F.fastq \\\n",
    "           -R ../course/data/Wolf_diet/wolf_R.fastq \\\n",
    "           -Z \\\n",
    "           > wolf_paired.fastq.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "1185355d-aaf6-4d48-b2dc-340c500799c6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed      \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_paired.fastq.gz mime type: text/fastq   \n",
      "\u001b[36mINFO\u001b[0m[0000] mode alignment                               \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "/ Reading sequences (5073/-, 50093 it/s) [0s] \u001b[36mINFO\u001b[0m[0000] - mime type: text/fastq                      \n",
      "                                                Reading sequences (13538/-, 46268 it/s) [0s] | Reading sequences (15231/-, 37250 it/s) [0s] / Reading sequences (18616/-, 31567 it/s) [0s]                                        | Reading sequences (3344/-, 8180 it/s) [0s] \\ Reading sequences (25387/-, 31567 it/s) [0s]                                        - Writing CSV (6692/-, 11124 it/s) [0s] | Reading sequences (27086/-, 31567 it/s) [0s] / Reading sequences (30462/-, 31567 it/s) [0s] | Reading sequences (13397/-, 11125 it/s) [0s] | Writing CSV (13397/-, 11124 it/s) [0s] - Reading sequences (33849/-, 30908 it/s) [1s]                                                \\ Reading sequences (15071/-, 13860 it/s) [1s] \\ Writing CSV (15071/-, 13859 it/s) [1s] | Reading sequences (20101/-, 13860 it/s) [1s] | Writing CSV (20101/-, 13859 it/s) [1s] - Reading sequences (41896/-, 30908 it/s) [1s] \n",
      "                                               / Reading sequences (23458/-, 13860 it/s) [1s] / Writing CSV (23458/-, 13859 it/s) [1s] \\ Reading sequences (26804/-, 13860 it/s) [1s] \\ Writing CSV (26804/-, 13859 it/s) [1s] | Reading sequences (34097/-, 13860 it/s) [1s] | Writing CSV (34097/-, 13859 it/s) [1s] \n",
      "\n"
     ]
    }
   ],
   "source": [
    "obigrep -a mode=alignment wolf_paired.fastq.gz \\\n",
    " | obicsv -k ali_length -k score_norm \\\n",
    " > wolf_paired_scores.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "20fc7a51-1d14-4a2d-a50c-377c28303b86",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed      \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_paired.fastq.gz mime type: text/fastq   \n",
      "\u001b[36mINFO\u001b[0m[0000] mode alignment                               \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "/ Reading sequences (3378/-, 17818 it/s) [0s] \u001b[36mINFO\u001b[0m[0000] - mime type: text/fastq                      \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "                                                Reading sequences (8457/-, 28787 it/s) [0s] | Reading sequences (13538/-, 27278 it/s) [0s] - Reading sequences (15228/-, 22019 it/s) [0s] / Reading sequences (3348/-, 6690 it/s) [0s] | Reading sequences (18614/-, 22019 it/s) [0s] - Reading sequences (5021/-, 6690 it/s) [0s] - Reading sequences (23692/-, 22019 it/s) [1s] | Reading sequences (27080/-, 21165 it/s) [1s] / Reading sequences (28768/-, 21165 it/s) [1s] \\ Reading sequences (10044/-, 8925 it/s) [1s] - Reading sequences (35544/-, 21165 it/s) [1s] | Reading sequences (16749/-, 8925 it/s) [1s] | Reading sequences (37232/-, 21165 it/s) [1s] - Reading sequences (18427/-, 8925 it/s) [1s] / Reading sequences (42318/-, 24145 it/s) [1s] \\ Reading sequences (43585/-, 24145 it/s) [1s] | Reading sequences (20101/-, 11693 it/s) [1s] \n",
      "                                               / Reading sequences (23458/-, 11693 it/s) [1s] - Reading sequences (26803/-, 11693 it/s) [1s] \\ Reading sequences (31830/-, 11693 it/s) [1s] \n"
     ]
    }
   ],
   "source": [
    "obigrep  -a mode=alignment wolf_paired.fastq.gz \\\n",
    "  | obigrep -p 'annotations.score_norm >= 0.96 && \n",
    "                annotations.ali_length > 55 && \n",
    "                annotations.ali_length < 65' \\\n",
    "            -Z \\\n",
    "            > wolf_paired_good.fastq.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "bfa5e59a-572e-4386-b948-83ab36b70c31",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_paired.fastq.gz mime type: text/fastq   \n",
      "                                                Reading sequences (16921/-, 47077 it/s) [0s] | Reading sequences (23692/-, 51190 it/s) [0s] / Reading sequences (32155/-, 56006 it/s) [0s] - Reading sequences (40629/-, 56006 it/s) [0s] \\ Reading sequences (42321/-, 56006 it/s) [0s] \n",
      "| entities |         n |\n",
      "| -------- | --------- |\n",
      "| variants |    45,276 |\n",
      "| reads    |    45,276 |\n",
      "| symbols  | 7,386,937 |\n"
     ]
    }
   ],
   "source": [
    "obicount wolf_paired.fastq.gz | csvlook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "849dfb1d-25b4-47bf-a99e-f2b19a2e1344",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_paired_good.fastq.gz mime type: text/fastq \n",
      "                                                Reading sequences (13765/-, 54484 it/s) [0s] \\ Reading sequences (26237/-, 74257 it/s) [0s] \n",
      "| entities |         n |\n",
      "| -------- | --------- |\n",
      "| variants |    27,955 |\n",
      "| reads    |    27,955 |\n",
      "| symbols  | 4,302,597 |\n"
     ]
    }
   ],
   "source": [
    "obicount wolf_paired_good.fastq.gz | csvlook"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f776e779-3cbc-46c5-a609-46ac9d158550",
   "metadata": {},
   "source": [
    "## Extracting the barcode\n",
    "\n",
    "Using the `obimultiplex` command, each read is assigned to its sample as described in the NGSFilter file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "b60a82b7-7d3c-48c7-bf6f-71a01f5a20d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_paired_good.fastq.gz mime type: text/fastq \n",
      "\u001b[36mINFO\u001b[0m[0000] Reading NGSFilter file: ../course/data/Wolf_diet/wolf_data_wolf_diet_ngsfilter.csv \n",
      "\u001b[36mINFO\u001b[0m[0000] No BOM detected                              \n",
      "\u001b[36mINFO\u001b[0m[0000] NGSFilter configuration mimetype: text/ngsfilter-csv \n",
      "\u001b[36mINFO\u001b[0m[0000] 3 parameters found                           \n",
      "\u001b[36mINFO\u001b[0m[0000] Read 5 records                               \n",
      "\u001b[36mINFO\u001b[0m[0000] First record: [experiment sample sample_tag forward_primer reverse_primer] \n",
      "\u001b[36mINFO\u001b[0m[0000] Set tag matching mode to strict              \n",
      "\u001b[36mINFO\u001b[0m[0000] Set global allowed primer mismatches to 2    \n",
      "\u001b[36mINFO\u001b[0m[0000] Disallows indels for primer matching         \n",
      "\u001b[36mINFO\u001b[0m[0000] Unassigned sequences saved in file: wolf_unassign.fastq.gz \n",
      "\u001b[36mINFO\u001b[0m[0000] Sequence demultiplexing using 16 workers     \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "                                                Reading sequences (10324/-, 42547 it/s) [0s] | Reading sequences (12045/-, 27315 it/s) [0s] \\ Reading sequences (17208/-, 22941 it/s) [0s] - Reading sequences (22802/-, 22941 it/s) [1s] \\ Reading sequences (27955/-, 22941 it/s) [1s] \n"
     ]
    }
   ],
   "source": [
    "obimultiplex -s ../course/data/Wolf_diet/wolf_data_wolf_diet_ngsfilter.csv \\\n",
    "             -u wolf_unassign.fastq.gz \\\n",
    "             -Z \\\n",
    "             wolf_paired_good.fastq.gz \\\n",
    "             > wolf_assign.fastq.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9e7e47ba-dedd-4cc5-9952-a6c3d94d18f2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_unassign.fastq.gz mime type: text/fastq \n",
      "\n",
      "| entities |      n |\n",
      "| -------- | ------ |\n",
      "| variants |    243 |\n",
      "| reads    |    243 |\n",
      "| symbols  | 23,598 |\n"
     ]
    }
   ],
   "source": [
    "obicount wolf_unassign.fastq.gz | csvlook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9942e6fe-83a1-4bb9-acea-f9479cae1930",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_assign.fastq.gz mime type: text/fastq   \n",
      "                                               eading sequences (4415/-, 14719 it/s) [0s] / Reading sequences (8829/-, 17607 it/s) [0s] - Reading sequences (11478/-, 17607 it/s) [0s] \\ Reading sequences (14125/-, 17607 it/s) [0s] / Reading sequences (16774/-, 17607 it/s) [0s] - Reading sequences (19425/-, 19283 it/s) [1s] \\ Reading sequences (23836/-, 19283 it/s) [1s] \n",
      "| entities |         n |\n",
      "| -------- | --------- |\n",
      "| variants |    27,712 |\n",
      "| reads    |    27,712 |\n",
      "| symbols  | 2,580,858 |\n"
     ]
    }
   ],
   "source": [
    "obicount wolf_assign.fastq.gz | csvlook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ca0c2bb5-b457-4787-9f81-c9a892e9cd64",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_assign.fastq.gz mime type: text/fastq   \n",
      "                                                Reading sequences (4416/-, 18923 it/s) [0s] - Writing CSV (4416/-, 18918 it/s) [0s] | Reading sequences (8831/-, 20349 it/s) [0s] | Writing CSV (8831/-, 20347 it/s) [0s] / Reading sequences (11477/-, 21376 it/s) [0s] / Writing CSV (11477/-, 21374 it/s) [0s] \\ Reading sequences (14125/-, 21376 it/s) [0s] \\ Writing CSV (14125/-, 21374 it/s) [0s] | Reading sequences (15008/-, 21376 it/s) [0s] | Writing CSV (15008/-, 21374 it/s) [0s] - Reading sequences (19425/-, 21376 it/s) [1s] - Writing CSV (19425/-, 21374 it/s) [1s] \\ Reading sequences (20308/-, 18061 it/s) [1s] \\ Writing CSV (20308/-, 18058 it/s) [1s] / Reading sequences (23836/-, 18061 it/s) [1s] / Writing CSV (23836/-, 18058 it/s) [1s] \n",
      "\n",
      "   6447 13a_F730603\n",
      "   6066 15a_F730814\n",
      "   9567 26a_F040644\n",
      "   5632 29a_F260619\n",
      "      1 sample\n"
     ]
    }
   ],
   "source": [
    "obicsv -k sample wolf_assign.fastq.gz \\\n",
    "  | tail -n +2 \\\n",
    "  | sort \\\n",
    "  | uniq -c"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "e1b06db5-099d-4540-a781-0dbfa797d9b6",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_assign.fastq.gz mime type: text/fastq   \n",
      "\u001b[36mINFO\u001b[0m[0000] Running dereplication on disk with 100 chunks \n",
      "\u001b[36mINFO\u001b[0m[0000] Keep sigletons in the output                 \n",
      "\u001b[36mINFO\u001b[0m[0000] Starting data splitting                      \n",
      "                                                 Reading sequences (4415/-, 16703 it/s) [0s] \\ Splitting data set (4415/-, 12929 it/s) [0s] | Reading sequences (6179/-, 14028 it/s) [0s] | Splitting data set (7062/-, 15883 it/s) [0s] / Reading sequences (8829/-, 16246 it/s) [0s] - Splitting data set (8829/-, 13801 it/s) [0s] - Reading sequences (10595/-, 16246 it/s) [0s] \\ Splitting data set (12361/-, 13801 it/s) [0s] \\ Reading sequences (15008/-, 16246 it/s) [0s] | Splitting data set (15008/-, 13801 it/s) [0s] | Reading sequences (15893/-, 16246 it/s) [0s] - Splitting data set (18542/-, 13801 it/s) [1s] - Reading sequences (19425/-, 18704 it/s) [1s] \\ Splitting data set (20308/-, 17161 it/s) [1s] \\ Reading sequences (21191/-, 18704 it/s) [1s] / Splitting data set (25604/-, 17161 it/s) [1s] / Reading sequences (26484/-, 18704 it/s) [1s] \n",
      "\n",
      "\u001b[36mINFO\u001b[0m[0001] Data splitted over 100 batches               \n",
      "\u001b[36mINFO\u001b[0m[0001] End of the data splitting                    \n",
      "\u001b[36mINFO\u001b[0m[0001] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0001] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0001] Data is writen to stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_0.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 0/100 : 37 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_1.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 1/100 : 14 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_10.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 2/100 : 13 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_11.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 3/100 : 13 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_12.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 4/100 : 355 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_13.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 5/100 : 21 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_14.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 6/100 : 18 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_15.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 7/100 : 38 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_16.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 8/100 : 17 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_17.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 9/100 : 11 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_18.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 10/100 : 7 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_19.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 11/100 : 20 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_2.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 12/100 : 25 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_20.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 13/100 : 7756 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_21.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 14/100 : 43 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_22.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 15/100 : 27 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_23.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 16/100 : 10 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_24.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 17/100 : 23 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_25.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 18/100 : 15 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_26.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 19/100 : 16 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_27.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 20/100 : 46 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_28.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 21/100 : 31 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_29.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 22/100 : 15 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_3.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 23/100 : 48 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_30.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 24/100 : 46 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_31.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 25/100 : 63 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_32.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 26/100 : 31 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_33.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 27/100 : 16 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_34.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 28/100 : 38 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_35.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 29/100 : 42 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_36.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 30/100 : 163 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_37.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 31/100 : 15 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_38.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 32/100 : 40 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_39.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0001] Start processing of batch 33/100 : 24 sequences \n",
      "\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_4.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 34/100 : 5825 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_40.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 35/100 : 20 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_41.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 36/100 : 23 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_42.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 37/100 : 97 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_43.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 38/100 : 27 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_44.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 39/100 : 1863 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_45.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 40/100 : 42 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_46.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 41/100 : 40 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_47.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 42/100 : 65 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_48.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 43/100 : 39 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_49.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 44/100 : 43 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_5.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 45/100 : 38 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_50.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 46/100 : 72 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_51.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 47/100 : 12 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_52.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 48/100 : 15 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_53.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 49/100 : 75 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_54.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 50/100 : 42 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_55.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 51/100 : 39 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_56.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 52/100 : 30 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_57.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 53/100 : 28 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_58.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 54/100 : 40 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_59.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 55/100 : 32 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_6.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 56/100 : 35 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_60.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 57/100 : 35 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_61.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 58/100 : 8405 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_62.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 59/100 : 29 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_63.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 60/100 : 25 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_64.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 61/100 : 79 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_65.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 62/100 : 27 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_66.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 63/100 : 25 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_67.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 64/100 : 21 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_68.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 65/100 : 23 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_69.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 66/100 : 21 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_7.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 67/100 : 48 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_70.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 68/100 : 15 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_71.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 69/100 : 26 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_72.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 70/100 : 29 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_73.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 71/100 : 13 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_74.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 72/100 : 26 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_75.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 73/100 : 15 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_76.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 74/100 : 15 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_77.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 75/100 : 21 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_78.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 76/100 : 57 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_79.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 77/100 : 18 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_8.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 78/100 : 21 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_80.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 79/100 : 17 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_81.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 80/100 : 15 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_82.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 81/100 : 55 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_83.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 82/100 : 30 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_84.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 83/100 : 47 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_85.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 84/100 : 37 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_86.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 85/100 : 10 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_87.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 86/100 : 21 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_88.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 87/100 : 26 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_89.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 88/100 : 9 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_9.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 89/100 : 47 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_90.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 90/100 : 42 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_91.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 91/100 : 13 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_92.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 92/100 : 36 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_93.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 93/100 : 26 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_94.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 94/100 : 235 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_95.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 95/100 : 29 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_96.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 96/100 : 304 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_97.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 97/100 : 29 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_98.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 98/100 : 18 sequences \n",
      "\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_99.fastx mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0002] Start processing of batch 99/100 : 33 sequences \n"
     ]
    }
   ],
   "source": [
    "obiuniq -m sample -Z \\\n",
    "        wolf_assign.fastq.gz \\\n",
    "        > wolf_uniq.fasta.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "aeaa4c26-19b7-4485-b5bd-d4d450a6d1ac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_uniq.fasta.gz mime type: text/fasta     \n",
      "\n",
      "| entities |      n |\n",
      "| -------- | ------ |\n",
      "| variants |    989 |\n",
      "| reads    | 27,712 |\n",
      "| symbols  | 97,427 |\n"
     ]
    }
   ],
   "source": [
    "obicount wolf_uniq.fasta.gz | csvlook"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d00e1c45-c378-4899-af45-0969d62a9fdd",
   "metadata": {},
   "source": [
    "## Dataset cleaning\n",
    "\n",
    "First step: looking at singleton sequences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "cbd7039e-ce10-41f2-888c-ce09a886ab13",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_uniq.fasta.gz mime type: text/fasta     \n",
      "\n",
      "\n",
      "      1 count\n",
      "    543 1\n",
      "    125 2\n",
      "     86 3\n",
      "     66 4\n",
      "     36 5\n",
      "     14 6\n",
      "     36 7\n",
      "     19 8\n",
      "     10 9\n"
     ]
    }
   ],
   "source": [
    "obicsv -k count wolf_uniq.fasta.gz \\\n",
    "  | sort -n \\\n",
    "  | uniq -c \\\n",
    "  | head -n 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "b296bbb9-d187-4485-9b4c-b1df5c9e3bca",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_uniq.fasta.gz mime type: text/fasta     \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "obigrep -c 2 -Z \\\n",
    "        wolf_uniq.fasta.gz \\\n",
    "        > wolf_nosingleton.fasta.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "a61b07fb-3215-435c-917c-cff21453b056",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_nosingleton.fasta.gz mime type: text/fasta \n",
      "\n",
      "| entities |      n |\n",
      "| -------- | ------ |\n",
      "| variants |    446 |\n",
      "| reads    | 27,169 |\n",
      "| symbols  | 43,760 |\n"
     ]
    }
   ],
   "source": [
    "obicount wolf_nosingleton.fasta.gz | csvlook"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a496d5a-afb5-4be8-97e7-21e7fefcd6ff",
   "metadata": {},
   "source": [
    "Second step: Look at the sequence length distribution."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "bab297cb-33c9-4488-8100-ca65fe6e9187",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_nosingleton.fasta.gz mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed      \n",
      "\n",
      "\u001b[36mINFO\u001b[0m[0000] - mime type: text/fasta                      \n",
      "\n",
      "\n",
      "      1 seq_length\n",
      "      5 4\n",
      "      1 5\n",
      "      1 8\n",
      "    179 99\n",
      "    259 100\n",
      "      1 106\n"
     ]
    }
   ],
   "source": [
    "obiannotate --length \\\n",
    "            wolf_nosingleton.fasta.gz \\\n",
    "| obicsv -k seq_length \\\n",
    "| sort -n \\\n",
    "| uniq -c "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "3966fc20-6629-4097-bde3-c3027d744ab1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_nosingleton.fasta.gz mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "obigrep -l 50 -Z \\\n",
    "        wolf_nosingleton.fasta.gz \\\n",
    "        > wolf_noshort.fasta.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "1865ca6b-0dd8-43d5-85c2-239d9e09327a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_noshort.fasta.gz mime type: text/fasta  \n",
      "\n",
      "| entities |      n |\n",
      "| -------- | ------ |\n",
      "| variants |    439 |\n",
      "| reads    | 25,290 |\n",
      "| symbols  | 43,727 |\n"
     ]
    }
   ],
   "source": [
    "obicount wolf_noshort.fasta.gz | csvlook"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "396cb1b7-4816-4fb9-be57-b1654c8d51c1",
   "metadata": {},
   "source": [
    "Step 3: Look at ambiguous nucleotides. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "88d9da85-2eb3-4ed8-aa7c-a6830aa946fe",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed      \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_noshort.fasta.gz mime type: text/fasta  \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\n",
      "\u001b[36mINFO\u001b[0m[0000] - mime type: text/fasta                      \n",
      "\n",
      "| entities |   n |\n",
      "| -------- | --- |\n",
      "| variants |   4 |\n",
      "| reads    |  10 |\n",
      "| symbols  | 399 |\n"
     ]
    }
   ],
   "source": [
    "obigrep -v -s '^[acgt]+$' \\\n",
    "        wolf_noshort.fasta.gz \\\n",
    "| obicount | csvlook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "c1d3c092-0af3-404a-a880-028dfdc99a87",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_noshort.fasta.gz mime type: text/fasta  \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "obigrep -Z -s '^[acgt]+$' \\\n",
    "        wolf_noshort.fasta.gz \\\n",
    "        > wolf_acgt.fasta.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "267a0727-ea1f-4933-8615-5b1f98b93e5c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_acgt.fasta.gz mime type: text/fasta     \n",
      "\n",
      "| entities |      n |\n",
      "| -------- | ------ |\n",
      "| variants |    435 |\n",
      "| reads    | 25,280 |\n",
      "| symbols  | 43,328 |\n"
     ]
    }
   ],
   "source": [
    "obicount wolf_acgt.fasta.gz | csvlook"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dbfcc481-ca4d-485e-ac1f-023804a60b11",
   "metadata": {},
   "source": [
    "### Running obiclean\n",
    "\n",
    "#### Evaluating the ration threshold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "7935c09f-dcf4-4d73-a7a0-3014171961a8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_acgt.fasta.gz mime type: text/fasta     \n",
      "\n",
      "\u001b[36mINFO\u001b[0m[0000] Sequence dataset of 435 sequeences loaded    \n",
      "\u001b[36mINFO\u001b[0m[0000] Dataset composed of 4 samples                \n",
      "                                                                       ror graph]  44% |██████         | (18389952 it/s) [0s:0s][One error graph]  59% |████████       | (17964160 it/s) [0s:0s][One error graph] 100% |███████████████| (19531142 it/s)[Annotate sequence status]  50% |███████        | (786 it/s) [0s:0s][Annotate sequence status]  75% |███████████    | (1141 it/s) [0s:0s][Annotate sequence status] 100% |███████████████| (1467 it/s)[Save GML Graph files]  50% |███████        | (1060 it/s) [0s:0s][Save GML Graph files]  75% |███████████    | (1202 it/s) [0s:0s][Save GML Graph files] 100% |███████████████| (1212 it/s)[Save CSV stat ratio file]   8% |█              | (81360 it/s) [0s:0s][Save CSV stat ratio file]  12% |█              | (74772 it/s) [0s:0s][Save CSV stat ratio file]  16% |██             | (73066 it/s) [0s:0s][Save CSV stat ratio file]  20% |███            | (72208 it/s) [0s:0s][Save CSV stat ratio file]  24% |███            | (70560 it/s) [0s:0s][Save CSV stat ratio file]  28% |████           | (71283 it/s) [0s:0s][Save CSV stat ratio file]  32% |████           | (66951 it/s) [0s:0s][Save CSV stat ratio file]  36% |█████          | (30132 it/s) [0s:0s][Save CSV stat ratio file]  40% |██████         | (30654 it/s) [0s:0s][Save CSV stat ratio file]  44% |██████         | (31945 it/s) [0s:0s][Save CSV stat ratio file]  48% |███████        | (33099 it/s) [0s:0s][Save CSV stat ratio file]  52% |███████        | (34555 it/s) [0s:0s][Save CSV stat ratio file]  56% |████████       | (35574 it/s) [0s:0s][Save CSV stat ratio file]  60% |█████████      | (34914 it/s) [0s:0s][Save CSV stat ratio file]  64% |█████████      | (36083 it/s) [0s:0s][Save CSV stat ratio file]  68% |██████████     | (36119 it/s) [0s:0s][Save CSV stat ratio file]  72% |██████████     | (36945 it/s) [0s:0s][Save CSV stat ratio file]  76% |███████████    | (37940 it/s) [0s:0s][Save CSV stat ratio file]  80% |████████████   | (38694 it/s) [0s:0s][Save CSV stat ratio file]  
84% |████████████   | (39511 it/s) [0s:0s][Save CSV stat ratio file]  88% |█████████████  | (39952 it/s) [0s:0s][Save CSV stat ratio file]  92% |█████████████  | (38738 it/s) [0s:0s][Save CSV stat ratio file]  96% |██████████████ | (38713 it/s) [0s:0s][Save CSV stat ratio file] 100% |███████████████| (39438 it/s)\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n"
     ]
    }
   ],
   "source": [
    "obiclean --save-ratio wolf_ratio.csv \\\n",
    "         --save-graph wolf_graph \\\n",
    "         wolf_acgt.fasta.gz \\\n",
    "         > wolf_obiclean_1.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "eeabe3a8-7069-469b-bc3c-5a0faef90fd7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_acgt.fasta.gz mime type: text/fasta     \n",
      "\n",
      "\u001b[36mINFO\u001b[0m[0000] Sequence dataset of 435 sequeences loaded    \n",
      "\u001b[36mINFO\u001b[0m[0000] Dataset composed of 4 samples                \n",
      "                                                                              ph]  44% |██████         | (11275210 it/s) [0s:0s][One error graph]  59% |████████       | (10893174 it/s) [0s:0s][One error graph] 100% |███████████████| (12325209 it/s)[Filter graph on abundance ratio]  50% |███████        | (54237 it/s) [0s:0s][Filter graph on abundance ratio]  75% |███████████    | (47337 it/s) [0s:0s][Filter graph on abundance ratio] 100% |███████████████| (44017 it/s)[Annotate sequence status]  50% |███████        | (18476 it/s) [0s:0s][Annotate sequence status]  75% |███████████    | (15161 it/s) [0s:0s][Annotate sequence status] 100% |███████████████| (3483 it/s)\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n"
     ]
    }
   ],
   "source": [
    "obiclean --detect-chimera \\\n",
    "         -r 0.1 -H \\\n",
    "         wolf_acgt.fasta.gz \\\n",
    "         > wolf_obiclean_2.fasta "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "3742381b-c1c8-4e6e-8400-741d3c54e4a9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_obiclean_2.fasta mime type: text/fasta  \n",
      "\n",
      "| entities |      n |\n",
      "| -------- | ------ |\n",
      "| variants |     30 |\n",
      "| reads    | 22,608 |\n",
      "| symbols  |  2,987 |\n"
     ]
    }
   ],
   "source": [
    "obicount wolf_obiclean_2.fasta | csvlook"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ee6152c-d6cf-4e53-91f9-3cab814de1dd",
   "metadata": {},
   "source": [
    "## Taxonomical assignment\n",
    "\n",
    "using `obitag`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "56245e8e-2eda-43d9-bfcd-031334780c74",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 32                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_obiclean_2.fasta mime type: text/fasta  \n",
      "\u001b[36mINFO\u001b[0m[0000] ./vert01_ref_db.fasta.gz mime type: text/fasta \n",
      "\u001b[36mINFO\u001b[0m[0000] Set as default taxonomy taxon                \n",
      "/ Reading sequences (1/-, 5 it/s) [0s] \n",
      "\u001b[36mINFO\u001b[0m[0000] 3851 reference sequences conserved on 3851   \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n"
     ]
    }
   ],
   "source": [
    "obitag -R ./vert01_ref_db.fasta.gz \\\n",
    "       --save-db ./vert01_ref_db_indexed.fasta \\\n",
    "       wolf_obiclean_2.fasta \\\n",
    "       > wolf_taxon.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "4b0663f1-cecb-4477-9d65-59d964db3e6d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed      \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_taxon.fasta mime type: text/fasta       \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\n",
      "\u001b[36mINFO\u001b[0m[0000] - mime type: text/fasta                      \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "obiannotate --number wolf_taxon.fasta \\\n",
    "| obiannotat e --set-identifier 'sprintf(\"MOTU_%03d\", annotations.seq_number)' \\\n",
    "      > wolf_short_id.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "4a658deb-105b-4d80-88c4-243af836f577",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_short_id.fasta mime type: text/fasta    \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "obigrep -p 'max(annotations.obiclean_weight) >= 100' \\\n",
    "        wolf_short_id.fasta \\\n",
    "        > wolf_no_rare.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "7d59699a-758f-485b-81df-c73388ffafe7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_no_rare.fasta mime type: text/fasta     \n",
      "\u001b[36mINFO\u001b[0m[0000] On output use JSON headers                   \n",
      "\u001b[36mINFO\u001b[0m[0000] Output is done on stdout                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout                     \n",
      "\n"
     ]
    }
   ],
   "source": [
    "obiannotate -k count \\\n",
    "            -k merged_sample \\\n",
    "            -k obiclean_weight \\\n",
    "            -k obitag_bestmatch \\\n",
    "            -k obitag_bestid \\\n",
    "            -k obitag_rank \\\n",
    "            -k taxid \\\n",
    "            wolf_no_rare.fasta \\\n",
    "            > wolf_taxon_cleaned.fasta"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "dc7a771d-6942-49cc-9976-ff198b4df3ea",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_taxon_cleaned.fasta mime type: text/fasta \n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "obicsv -i -s --auto \\\n",
    "       wolf_taxon_cleaned.fasta \\\n",
    "       > wolf_taxon_cleaned.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "b1605913-f581-4120-a42a-e2a181ea7aa7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[36mINFO\u001b[0m[0000] Number of workers set 16                     \n",
      "\u001b[36mINFO\u001b[0m[0000] Found 1 files to process                     \n",
      "\u001b[36mINFO\u001b[0m[0000] wolf_no_rare.fasta mime type: text/fasta     \n",
      "\n",
      "| taxon                                    | id       | obitag_bestid | 13a_F730603 | 15a_F730814 | 26a_F040644 | 29a_F260619 |\n",
      "| ---------------------------------------- | -------- | ------------- | ----------- | ----------- | ----------- | ----------- |\n",
      "| taxon:9611 [Canis]@genus                 | MOTU_066 |        1.000… |           9 |           4 |         328 |           1 |\n",
      "| taxon:9992 [Marmota]@genus               | MOTU_006 |        0.990… |           0 |           0 |       8,744 |           0 |\n",
      "| taxon:35500 [Pecora]@infraorder          | MOTU_014 |        0.950… |           0 |           0 |           0 |         152 |\n",
      "| taxon:9860 [Cervus elaphus]@species      | MOTU_017 |        1.000… |       6,192 |           0 |           0 |           0 |\n",
      "| taxon:55153 [Sciuridae]@family           | MOTU_020 |        0.949… |           0 |           0 |         146 |           0 |\n",
      "| taxon:9858 [Capreolus capreolus]@species | MOTU_039 |        1.000… |           0 |       5,975 |           0 |       3,404 |\n",
      "|                                          |          |               |             |             |             |             |\n"
     ]
    }
   ],
   "source": [
    "obimatrix --transpose \\\n",
    "          -k id \\\n",
    "          -k taxid \\\n",
    "          -k obitag_bestid \\\n",
    "          --map obiclean_weight \\\n",
    "          wolf_no_rare.fasta | csvlook"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "db64f7f0-c891-488d-a724-66644c2348f7",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Bash",
   "language": "bash",
   "name": "bash"
  },
  "language_info": {
   "codemirror_mode": "shell",
   "file_extension": ".sh",
   "mimetype": "text/x-sh",
   "name": "bash"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}