1736 lines
99 KiB
Plaintext
1736 lines
99 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d74a3346-9e41-42db-8134-6cc3e697dafc",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Wolf diet analysis\n",
|
|
"\n",
|
|
"## Preparing the reference database\n",
|
|
"\n",
|
|
"We'll use a small version of *GenBank* containing only mammal sequences shorter than 20 kb.\n",
|
|
"\n",
|
|
"```bash\n",
|
|
"obigrep -L 20000 -Z \\\n",
|
|
" ../course/data/Genbank/Release_264 \\\n",
|
|
" > gb264_small_mam.fasta.gz\n",
|
|
"```\n",
|
|
"\n",
|
|
"We'll run `obipcr` on that small database.\n",
|
|
"\n",
|
|
"Vertebrate primers:\n",
|
|
"\n",
|
|
"- forward: TTAGATACCCCACTATGC\n",
|
|
"- reverse: TAGAACAGGCTCCTCTAG\n",
|
|
"\n",
|
|
"We'll allow at most 4 mismatches on each primer (`-e 4`)."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "c02aea46-81db-4131-a1c5-7cc4ede91636",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 32 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] ../course/data/Genbank/Release_264/small.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obipcr --forward TTAGATACCCCACTATGC \\\n",
|
|
" --reverse TAGAACAGGCTCCTCTAG \\\n",
|
|
" -L 200 -e 4 -Z \\\n",
|
|
" --no-progressbar \\\n",
|
|
" ../course/data/Genbank/Release_264/small.fasta.gz \\\n",
|
|
" > vert01_raw_db.fasta.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "3749e102-9e4a-4d33-8ec3-34fb08db6bca",
|
|
"metadata": {
|
|
"collapsed": true,
|
|
"jupyter": {
|
|
"outputs_hidden": true
|
|
}
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] vert01_raw_db.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Running dereplication on disk with 100 chunks \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Keep sigletons in the output \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Starting data splitting \n",
|
|
" - Reading sequences (20678/-, 72999 it/s) [0s] - Splitting data set (22774/-, 78503 it/s) [0s] \n",
|
|
"\n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data splitted over 100 batches \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] End of the data splitting \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_0.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 0/100 : 108 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_1.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 1/100 : 105 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_10.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 2/100 : 83 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_11.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 3/100 : 100 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_12.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 4/100 : 81 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_13.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 5/100 : 322 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_14.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 6/100 : 1022 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_15.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 7/100 : 156 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_16.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 8/100 : 304 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_17.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 9/100 : 368 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_18.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 10/100 : 148 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_19.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 11/100 : 526 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_2.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 12/100 : 185 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_20.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 13/100 : 110 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_21.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 14/100 : 115 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_22.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 15/100 : 305 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_23.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 16/100 : 61 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_24.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 17/100 : 215 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_25.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 18/100 : 228 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_26.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 19/100 : 112 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_27.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 20/100 : 642 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_28.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 21/100 : 167 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_29.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 22/100 : 573 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_3.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 23/100 : 60 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_30.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 24/100 : 592 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_31.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 25/100 : 215 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_32.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 26/100 : 183 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_33.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 27/100 : 93 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_34.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 28/100 : 129 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_35.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 29/100 : 173 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_36.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 30/100 : 160 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_37.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 31/100 : 127 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_38.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 32/100 : 108 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_39.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 33/100 : 200 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_4.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 34/100 : 112 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_40.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 35/100 : 167 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_41.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 36/100 : 249 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_42.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 37/100 : 138 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_43.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 38/100 : 235 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_44.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 39/100 : 135 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_45.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 40/100 : 451 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_46.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 41/100 : 87 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_47.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 42/100 : 177 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_48.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 43/100 : 166 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_49.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 44/100 : 75 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_5.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 45/100 : 329 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_50.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 46/100 : 811 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_51.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 47/100 : 298 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_52.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 48/100 : 128 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_53.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 49/100 : 70 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_54.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 50/100 : 47 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_55.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 51/100 : 241 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_56.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 52/100 : 167 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_57.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 53/100 : 136 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_58.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 54/100 : 261 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_59.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 55/100 : 265 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_6.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 56/100 : 146 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_60.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 57/100 : 244 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_61.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 58/100 : 153 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_62.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 59/100 : 220 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_63.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 60/100 : 299 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_64.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 61/100 : 850 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_65.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 62/100 : 147 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_66.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 63/100 : 132 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_67.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 64/100 : 216 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_68.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 65/100 : 105 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_69.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 66/100 : 857 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_7.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 67/100 : 87 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_70.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 68/100 : 180 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_71.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 69/100 : 181 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_72.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 70/100 : 94 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_73.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 71/100 : 98 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_74.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 72/100 : 49 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_75.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 73/100 : 264 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_76.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 74/100 : 367 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_77.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 75/100 : 133 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_78.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 76/100 : 88 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_79.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 77/100 : 1445 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_8.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 78/100 : 1718 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_80.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 79/100 : 128 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_81.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 80/100 : 146 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_82.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 81/100 : 108 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_83.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 82/100 : 100 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_84.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 83/100 : 254 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_85.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 84/100 : 119 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_86.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 85/100 : 370 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_87.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 86/100 : 279 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_88.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 87/100 : 202 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_89.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 88/100 : 224 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_9.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 89/100 : 405 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_90.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 90/100 : 190 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_91.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 91/100 : 104 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_92.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 92/100 : 96 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_93.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 93/100 : 58 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_94.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 94/100 : 79 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_95.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 95/100 : 222 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_96.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start processing of batch 96/100 : 4158 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] /tmp/obiseq_chunks_488015107/chunk_97.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 97/100 : 129 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_488015107/chunk_98.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 98/100 : 183 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_488015107/chunk_99.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 99/100 : 348 sequences \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obiuniq -m taxid -Z \\\n",
|
|
" vert01_raw_db.fasta.gz \\\n",
|
|
" > vert01_uniq_db.fasta.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "826d4d84-f446-417d-a67f-79ee222194e7",
|
|
"metadata": {
|
|
"collapsed": true,
|
|
"jupyter": {
|
|
"outputs_hidden": true
|
|
},
|
|
"scrolled": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0001] NCBI Taxdump Tar Archive detected: ../course/data/ncbitaxo_20251118.tgz \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Loading Taxonomy nodes \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] NCBI Taxdump Tar Archive detected: ../course/data/ncbitaxo_20251118.tgz \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Loading Taxonomy nodes \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] 2706727 Taxonomy nodes read \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] Loading Taxon names \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] 2706727 Taxonomy nodes read \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] Loading Taxon names \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] 2706727 taxon names read \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] Loading Merged taxa \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] 2706727 taxon names read \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] Loading Merged taxa \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] 93509 merged taxa read \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] Set as default taxonomy NCBI Taxonomy \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] vert01_uniq_db.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0038] Data is writen to stdout \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] GU981038_sub[458..556]: Taxid 150090 has to be updated to taxon:3371150 [Pseudosoriculus fumidus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] KY410960_sub[517..615]: Taxid 1620840 has to be updated to taxon:2740394 [Sundasciurus altitudinis]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] KX381710_sub[181..280]: Taxid 263451 has to be updated to taxon:3370180 [Lophostoma silvicola]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] JF694095_sub[447..545]: Taxid 164531 has to be updated to taxon:3370248 [Microperoryctes ornatus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] AY495457_sub[449..548]: Taxid 258864 has to be updated to taxon:3370479 [Ozimops planiceps]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] KY464180_sub[516..615]: Taxid 2021987 has to be updated to taxon:3370147 [Lenothrix cana]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] KY581660_sub[517..616]: Taxid 478698 has to be updated to taxon:3370289 [Mops plicatus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] AY495540_sub[446..543]: Taxid 258960 has to be updated to taxon:2720888 [Vespadelus darlingtoni]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] AJ430552_sub[441..540]: Taxid 410298 has to be updated to taxon:3370000 [Gerbilliscus afer]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] MF038679_sub[446..543]: Taxid 2093327 has to be updated to taxon:3369970 [Eptesicus dutertreus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] JX520511_sub[433..531]: Taxid 35665 has to be updated to taxon:3370058 [Heterogeomys hispidus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] MN807618_sub[513..611]: Taxid 119565 has to be updated to taxon:3371118 [Montemys delectorum]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] MK211397_sub[445..543]: Taxid 114227 has to be updated to taxon:3370452 [Onychogalea frenata]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] MF038680_sub[440..537]: Taxid 258906 has to be updated to taxon:3371012 [Cnephaeus hottentotus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] MZ708834_sub[527..626]: Taxid 224955 has to be updated to taxon:3370766 [Scaptonyx fusicauda]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] AJ389548_sub[455..553]: Taxid 35664 has to be updated to taxon:3370057 [Heterogeomys heterodus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] KX591624_sub[446..543]: Taxid 1898424 has to be updated to taxon:2778565 [Laephotis stanleyi]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] AF203773_sub[449..548]: Taxid 110940 has to be updated to taxon:3371137 [Paremballonura atrata]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] OR257582_sub[644..744]: Taxid 3055736 has to be updated to taxon:3370154 [Leopardus pardinoides]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] JQ039218_sub[446..543]: Taxid 568927 has to be updated to taxon:3369693 [Afronycteris nanus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0038] OM334923_sub[449..547]: Taxid 209424 has to be updated to taxon:3370523 [Petaurista grandis]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] JF694114_sub[450..548]: Taxid 1031278 has to be updated to taxon:3370506 [Perameles notina]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] KX381233_sub[178..277]: Taxid 40227 has to be updated to taxon:3369931 [Dermanura gnomus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] KX381261_sub[181..280]: Taxid 263451 has to be updated to taxon:3370180 [Lophostoma silvicola]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] KX754507_sub[450..549]: Taxid 224955 has to be updated to taxon:3370766 [Scaptonyx fusicauda]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] JN255346_sub[448..547]: Taxid 190653 has to be updated to taxon:3369760 [Cercartetus nana]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] AY530083_sub[459..558]: Taxid 264759 has to be updated to taxon:3370991 [Atopogale cubana]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] OP328298_sub[837..934]: Taxid 169057 has to be updated to taxon:3371009 [Cnephaeus bottae]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] HM561626_sub[498..595]: Taxid 59452 has to be updated to taxon:3371021 [Cnephaeus serotinus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] KP202255_sub[1281..1381]: Taxid 61412 has to be updated to taxon:3369756 [Caracal auratus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] AF263234_sub[447..543]: Taxid 124746 has to be updated to taxon:3371123 [Neoeptesicus furinalis]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] KR537889_sub[522..620]: Taxid 61085 has to be updated to taxon:3369802 [Crocidura beata]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] AY495466_sub[452..549]: Taxid 258906 has to be updated to taxon:3371012 [Cnephaeus hottentotus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] AY495514_sub[446..543]: Taxid 177184 has to be updated to taxon:2778567 [Pseudoromicia brunnea]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] KX754505_sub[452..551]: Taxid 224955 has to be updated to taxon:3370766 [Scaptonyx fusicauda]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] MH801935_sub[521..620]: Taxid 2588392 has to be updated to taxon:3370481 [Parablarinella griselda]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] AF442092_sub[447..546]: Taxid 263451 has to be updated to taxon:3370180 [Lophostoma silvicola]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] HM106321_sub[518..617]: Taxid 55048 has to be updated to taxon:3370370 [Neogale frenata]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] AY495467_sub[447..544]: Taxid 59452 has to be updated to taxon:3371021 [Cnephaeus serotinus]@species \n",
|
|
"\u001b[33mWARN\u001b[0m[0039] KX754504_sub[450..549]: Taxid 224955 has to be updated to taxon:3370766 [Scaptonyx fusicauda]@species \n",
|
|
"\n",
|
|
"\u001b[36mINFO\u001b[0m[0039] 93509 merged taxa read \n",
|
|
"\u001b[36mINFO\u001b[0m[0039] Set as default taxonomy NCBI Taxonomy \n",
|
|
"\u001b[36mINFO\u001b[0m[0039] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0039] Reading sequences from stdin in guessed \n",
|
|
"\u001b[36mINFO\u001b[0m[0039] - mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0039] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0039] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0039] Data is writen to stdout \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obiannotate --add-lca-in taxid \\\n",
|
|
" -t ../course/data/ncbitaxo_20251118.tgz \\\n",
|
|
" vert01_uniq_db.fasta.gz \\\n",
|
|
" | obiannotate -t ../course/data/ncbitaxo_20251118.tgz \\\n",
|
|
" --taxonomic-rank -Z \\\n",
|
|
" > vert01_lca_db.fasta.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "0e7d06ec-6477-4fdd-8213-295d5f1c21ba",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] vert01_lca_db.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"\n",
|
|
" 4 clade\n",
|
|
" 52 family\n",
|
|
" 374 genus\n",
|
|
" 6 no rank\n",
|
|
" 1 order\n",
|
|
" 3180 species\n",
|
|
" 1 species group\n",
|
|
" 46 subfamily\n",
|
|
" 6 subgenus\n",
|
|
" 1 suborder\n",
|
|
" 180 subspecies\n",
|
|
" 6 tribe\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicsv -k taxonomic_rank vert01_lca_db.fasta.gz \\\n",
|
|
" | tail -n +2 \\\n",
|
|
" | sort \\\n",
|
|
" | uniq -c"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "2b5a78c0-f1f8-4e6d-80f0-076cac6a19b5",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] NCBI Taxdump Tar Archive detected: ../course/data/ncbitaxo_20251118.tgz \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Loading Taxonomy nodes \n",
|
|
"\u001b[36mINFO\u001b[0m[0007] 2706727 Taxonomy nodes read \n",
|
|
"\u001b[36mINFO\u001b[0m[0007] Loading Taxon names \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] 2706727 taxon names read \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] Loading Merged taxa \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] 93509 merged taxa read \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] Set as default taxonomy NCBI Taxonomy \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] vert01_lca_db.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0016] Data is writen to stdout \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Keep only the reference sequences whose taxid has a family-level rank\n",
|
|
"# available (--require-rank family).\n",
|
|
"# -Z gzip-compresses the output so the content matches the .gz file name,\n",
|
|
"# as done for every other *.gz file produced in this notebook.\n",
|
|
"obigrep --require-rank family \\\n",
|
|
" -t ../course/data/ncbitaxo_20251118.tgz \\\n",
|
|
" --update-taxid \\\n",
|
|
" -Z \\\n",
|
|
" vert01_lca_db.fasta.gz \\\n",
|
|
" > vert01_clean_db.fasta.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "713eea58-4ce9-4444-a945-61d3e24aab63",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] vert01_clean_db.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"\n",
|
|
" 52 family\n",
|
|
" 374 genus\n",
|
|
" 6 no rank\n",
|
|
" 3180 species\n",
|
|
" 1 species group\n",
|
|
" 46 subfamily\n",
|
|
" 6 subgenus\n",
|
|
" 180 subspecies\n",
|
|
" 6 tribe\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicsv -k taxonomic_rank vert01_clean_db.fasta.gz \\\n",
|
|
" | tail -n +2 \\\n",
|
|
" | sort \\\n",
|
|
" | uniq -c"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"id": "374322c4-8ce0-46a2-b5af-aa6900ee8735",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] vert01_clean_db.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"\n",
|
|
" 21 family\n",
|
|
" 188 genus\n",
|
|
" 3 no rank\n",
|
|
" 2021 species\n",
|
|
" 1 species group\n",
|
|
" 23 subfamily\n",
|
|
" 3 subgenus\n",
|
|
" 98 subspecies\n",
|
|
" 2 tribe\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Count the distinct taxids per rank: drop the CSV header, deduplicate the\n",
|
|
"# taxid values, keep the text after '@' and tally it.\n",
|
|
"obicsv -k taxid vert01_clean_db.fasta.gz \\\n",
|
|
" | tail -n +2 \\\n",
|
|
" | sort -u \\\n",
|
|
" | awk -F'@' '{print $2}' \\\n",
|
|
" | sort \\\n",
|
|
" | uniq -c"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"id": "4e1cd2a1-e0bc-4301-8b60-69bba99ff384",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] NCBI Taxdump Tar Archive detected: ../course/data/ncbitaxo_20251118.tgz \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Loading Taxonomy nodes \n",
|
|
"\u001b[36mINFO\u001b[0m[0007] 2706727 Taxonomy nodes read \n",
|
|
"\u001b[36mINFO\u001b[0m[0007] Loading Taxon names \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] 2706727 taxon names read \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] Loading Merged taxa \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] 93509 merged taxa read \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] Set as default taxonomy NCBI Taxonomy \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] vert01_clean_db.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0017] Data is writen to stdout \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obiannotate --taxonomic-path \\\n",
|
|
" -t ../course/data/ncbitaxo_20251118.tgz \\\n",
|
|
" -Z \\\n",
|
|
" vert01_clean_db.fasta.gz \\\n",
|
|
" > vert01_ref_db.fasta.gz "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "abf03b79-7a61-4d70-a212-7f7ee7e2319e",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Analyzing the metabarcoding data\n",
|
|
"\n",
|
|
"### Step 1: Pairing the Reads\n",
|
|
"\n",
|
|
"We'll use the `obipairing` *OBITools4* command."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 10,
|
|
"id": "ee953292-a9ed-4a2e-bec3-2482dbf0f120",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] ../course/data/Wolf_diet/wolf_F.fastq mime type: text/fastq \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] ../course/data/Wolf_diet/wolf_R.fastq mime type: text/fastq \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Start of the sequence Pairing using 16 workers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Initializing the DNA Scoring matrix \n",
|
|
" - Reading sequences (15525/-, 59229 it/s) [0s] | Reading sequences (19407/-, 43953 it/s) [0s] | Reading sequences (27170/-, 60156 it/s) [0s] / Reading sequences (23288/-, 42042 it/s) [0s] - Reading sequences (31052/-, 47481 it/s) [0s] | Reading sequences (38815/-, 47481 it/s) [0s] \\ Reading sequences (31052/-, 27527 it/s) [1s] - Reading sequences (38815/-, 27527 it/s) [1s] - Reading sequences (41393/-, 30190 it/s) [1s] \\ Reading sequences (41393/-, 27527 it/s) [1s] \n",
|
|
" | Reading sequences (45276/-, 30190 it/s) [1s] \n",
|
|
"\u001b[36mINFO\u001b[0m[0003] End of the sequence Pairing \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obipairing -F ../course/data/Wolf_diet/wolf_F.fastq \\\n",
|
|
" -R ../course/data/Wolf_diet/wolf_R.fastq \\\n",
|
|
" -Z \\\n",
|
|
" > wolf_paired.fastq.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"id": "1185355d-aaf6-4d48-b2dc-340c500799c6",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_paired.fastq.gz mime type: text/fastq \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] mode alignment \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"/ Reading sequences (5073/-, 50093 it/s) [0s] \u001b[36mINFO\u001b[0m[0000] - mime type: text/fastq \n",
|
|
" Reading sequences (13538/-, 46268 it/s) [0s] | Reading sequences (15231/-, 37250 it/s) [0s] / Reading sequences (18616/-, 31567 it/s) [0s] | Reading sequences (3344/-, 8180 it/s) [0s] \\ Reading sequences (25387/-, 31567 it/s) [0s] - Writing CSV (6692/-, 11124 it/s) [0s] | Reading sequences (27086/-, 31567 it/s) [0s] / Reading sequences (30462/-, 31567 it/s) [0s] | Reading sequences (13397/-, 11125 it/s) [0s] | Writing CSV (13397/-, 11124 it/s) [0s] - Reading sequences (33849/-, 30908 it/s) [1s] \\ Reading sequences (15071/-, 13860 it/s) [1s] \\ Writing CSV (15071/-, 13859 it/s) [1s] | Reading sequences (20101/-, 13860 it/s) [1s] | Writing CSV (20101/-, 13859 it/s) [1s] - Reading sequences (41896/-, 30908 it/s) [1s] \n",
|
|
" / Reading sequences (23458/-, 13860 it/s) [1s] / Writing CSV (23458/-, 13859 it/s) [1s] \\ Reading sequences (26804/-, 13860 it/s) [1s] \\ Writing CSV (26804/-, 13859 it/s) [1s] | Reading sequences (34097/-, 13860 it/s) [1s] | Writing CSV (34097/-, 13859 it/s) [1s] \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obigrep -a mode=alignment wolf_paired.fastq.gz \\\n",
|
|
" | obicsv -k ali_length -k score_norm \\\n",
|
|
" > wolf_paired_scores.csv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"id": "20fc7a51-1d14-4a2d-a50c-377c28303b86",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_paired.fastq.gz mime type: text/fastq \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] mode alignment \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"/ Reading sequences (3378/-, 17818 it/s) [0s] \u001b[36mINFO\u001b[0m[0000] - mime type: text/fastq \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
" Reading sequences (8457/-, 28787 it/s) [0s] | Reading sequences (13538/-, 27278 it/s) [0s] - Reading sequences (15228/-, 22019 it/s) [0s] / Reading sequences (3348/-, 6690 it/s) [0s] | Reading sequences (18614/-, 22019 it/s) [0s] - Reading sequences (5021/-, 6690 it/s) [0s] - Reading sequences (23692/-, 22019 it/s) [1s] | Reading sequences (27080/-, 21165 it/s) [1s] / Reading sequences (28768/-, 21165 it/s) [1s] \\ Reading sequences (10044/-, 8925 it/s) [1s] - Reading sequences (35544/-, 21165 it/s) [1s] | Reading sequences (16749/-, 8925 it/s) [1s] | Reading sequences (37232/-, 21165 it/s) [1s] - Reading sequences (18427/-, 8925 it/s) [1s] / Reading sequences (42318/-, 24145 it/s) [1s] \\ Reading sequences (43585/-, 24145 it/s) [1s] | Reading sequences (20101/-, 11693 it/s) [1s] \n",
|
|
" / Reading sequences (23458/-, 11693 it/s) [1s] - Reading sequences (26803/-, 11693 it/s) [1s] \\ Reading sequences (31830/-, 11693 it/s) [1s] \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obigrep -a mode=alignment wolf_paired.fastq.gz \\\n",
|
|
" | obigrep -p 'annotations.score_norm >= 0.96 && \n",
|
|
" annotations.ali_length > 55 && \n",
|
|
" annotations.ali_length < 65' \\\n",
|
|
" -Z \\\n",
|
|
" > wolf_paired_good.fastq.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"id": "bfa5e59a-572e-4386-b948-83ab36b70c31",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_paired.fastq.gz mime type: text/fastq \n",
|
|
" Reading sequences (16921/-, 47077 it/s) [0s] | Reading sequences (23692/-, 51190 it/s) [0s] / Reading sequences (32155/-, 56006 it/s) [0s] - Reading sequences (40629/-, 56006 it/s) [0s] \\ Reading sequences (42321/-, 56006 it/s) [0s] \n",
|
|
"| entities | n |\n",
|
|
"| -------- | --------- |\n",
|
|
"| variants | 45,276 |\n",
|
|
"| reads | 45,276 |\n",
|
|
"| symbols | 7,386,937 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicount wolf_paired.fastq.gz | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 14,
|
|
"id": "849dfb1d-25b4-47bf-a99e-f2b19a2e1344",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_paired_good.fastq.gz mime type: text/fastq \n",
|
|
" Reading sequences (13765/-, 54484 it/s) [0s] \\ Reading sequences (26237/-, 74257 it/s) [0s] \n",
|
|
"| entities | n |\n",
|
|
"| -------- | --------- |\n",
|
|
"| variants | 27,955 |\n",
|
|
"| reads | 27,955 |\n",
|
|
"| symbols | 4,302,597 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicount wolf_paired_good.fastq.gz | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "f776e779-3cbc-46c5-a609-46ac9d158550",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Extracting the barcode\n",
|
|
"\n",
|
|
"Using the `obimultiplex` command. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"id": "b60a82b7-7d3c-48c7-bf6f-71a01f5a20d9",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_paired_good.fastq.gz mime type: text/fastq \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Reading NGSFilter file: ../course/data/Wolf_diet/wolf_data_wolf_diet_ngsfilter.csv \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] No BOM detected \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] NGSFilter configuration mimetype: text/ngsfilter-csv \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] 3 parameters found \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Read 5 records \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] First record: [experiment sample sample_tag forward_primer reverse_primer] \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Set tag matching mode to strict \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Set global allowed primer mismatches to 2 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Disallows indels for primer matching \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Unassigned sequences saved in file: wolf_unassign.fastq.gz \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Sequence demultiplexing using 16 workers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
" Reading sequences (10324/-, 42547 it/s) [0s] | Reading sequences (12045/-, 27315 it/s) [0s] \\ Reading sequences (17208/-, 22941 it/s) [0s] - Reading sequences (22802/-, 22941 it/s) [1s] \\ Reading sequences (27955/-, 22941 it/s) [1s] \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obimultiplex -s ../course/data/Wolf_diet/wolf_data_wolf_diet_ngsfilter.csv \\\n",
|
|
" -u wolf_unassign.fastq.gz \\\n",
|
|
" -Z \\\n",
|
|
" wolf_paired_good.fastq.gz \\\n",
|
|
" > wolf_assign.fastq.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"id": "9e7e47ba-dedd-4cc5-9952-a6c3d94d18f2",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_unassign.fastq.gz mime type: text/fastq \n",
|
|
"\n",
|
|
"| entities | n |\n",
|
|
"| -------- | ------ |\n",
|
|
"| variants | 243 |\n",
|
|
"| reads | 243 |\n",
|
|
"| symbols | 23,598 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicount wolf_unassign.fastq.gz | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 17,
|
|
"id": "9942e6fe-83a1-4bb9-acea-f9479cae1930",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_assign.fastq.gz mime type: text/fastq \n",
|
|
" eading sequences (4415/-, 14719 it/s) [0s] / Reading sequences (8829/-, 17607 it/s) [0s] - Reading sequences (11478/-, 17607 it/s) [0s] \\ Reading sequences (14125/-, 17607 it/s) [0s] / Reading sequences (16774/-, 17607 it/s) [0s] - Reading sequences (19425/-, 19283 it/s) [1s] \\ Reading sequences (23836/-, 19283 it/s) [1s] \n",
|
|
"| entities | n |\n",
|
|
"| -------- | --------- |\n",
|
|
"| variants | 27,712 |\n",
|
|
"| reads | 27,712 |\n",
|
|
"| symbols | 2,580,858 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicount wolf_assign.fastq.gz | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "ca0c2bb5-b457-4787-9f81-c9a892e9cd64",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_assign.fastq.gz mime type: text/fastq \n",
|
|
" Reading sequences (4416/-, 18923 it/s) [0s] - Writing CSV (4416/-, 18918 it/s) [0s] | Reading sequences (8831/-, 20349 it/s) [0s] | Writing CSV (8831/-, 20347 it/s) [0s] / Reading sequences (11477/-, 21376 it/s) [0s] / Writing CSV (11477/-, 21374 it/s) [0s] \\ Reading sequences (14125/-, 21376 it/s) [0s] \\ Writing CSV (14125/-, 21374 it/s) [0s] | Reading sequences (15008/-, 21376 it/s) [0s] | Writing CSV (15008/-, 21374 it/s) [0s] - Reading sequences (19425/-, 21376 it/s) [1s] - Writing CSV (19425/-, 21374 it/s) [1s] \\ Reading sequences (20308/-, 18061 it/s) [1s] \\ Writing CSV (20308/-, 18058 it/s) [1s] / Reading sequences (23836/-, 18061 it/s) [1s] / Writing CSV (23836/-, 18058 it/s) [1s] \n",
|
|
"\n",
|
|
" 6447 13a_F730603\n",
|
|
" 6066 15a_F730814\n",
|
|
" 9567 26a_F040644\n",
|
|
" 5632 29a_F260619\n",
|
|
" 1 sample\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicsv -k sample wolf_assign.fastq.gz \\\n",
|
|
" | tail -n +2 \\\n",
|
|
" | sort \\\n",
|
|
" | uniq -c"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "e1b06db5-099d-4540-a781-0dbfa797d9b6",
|
|
"metadata": {
|
|
"collapsed": true,
|
|
"jupyter": {
|
|
"outputs_hidden": true
|
|
},
|
|
"scrolled": true
|
|
},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_assign.fastq.gz mime type: text/fastq \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Running dereplication on disk with 100 chunks \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Keep sigletons in the output \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Starting data splitting \n",
|
|
" Reading sequences (4415/-, 16703 it/s) [0s] \\ Splitting data set (4415/-, 12929 it/s) [0s] | Reading sequences (6179/-, 14028 it/s) [0s] | Splitting data set (7062/-, 15883 it/s) [0s] / Reading sequences (8829/-, 16246 it/s) [0s] - Splitting data set (8829/-, 13801 it/s) [0s] - Reading sequences (10595/-, 16246 it/s) [0s] \\ Splitting data set (12361/-, 13801 it/s) [0s] \\ Reading sequences (15008/-, 16246 it/s) [0s] | Splitting data set (15008/-, 13801 it/s) [0s] | Reading sequences (15893/-, 16246 it/s) [0s] - Splitting data set (18542/-, 13801 it/s) [1s] - Reading sequences (19425/-, 18704 it/s) [1s] \\ Splitting data set (20308/-, 17161 it/s) [1s] \\ Reading sequences (21191/-, 18704 it/s) [1s] / Splitting data set (25604/-, 17161 it/s) [1s] / Reading sequences (26484/-, 18704 it/s) [1s] \n",
|
|
"\n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Data splitted over 100 batches \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] End of the data splitting \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Data is writen to stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_0.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 0/100 : 37 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_1.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 1/100 : 14 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_10.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 2/100 : 13 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_11.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 3/100 : 13 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_12.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 4/100 : 355 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_13.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 5/100 : 21 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_14.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 6/100 : 18 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_15.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 7/100 : 38 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_16.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 8/100 : 17 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_17.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 9/100 : 11 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_18.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 10/100 : 7 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_19.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 11/100 : 20 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_2.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 12/100 : 25 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_20.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 13/100 : 7756 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_21.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 14/100 : 43 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_22.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 15/100 : 27 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_23.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 16/100 : 10 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_24.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 17/100 : 23 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_25.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 18/100 : 15 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_26.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 19/100 : 16 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_27.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 20/100 : 46 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_28.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 21/100 : 31 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_29.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 22/100 : 15 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_3.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 23/100 : 48 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_30.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 24/100 : 46 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_31.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 25/100 : 63 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_32.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 26/100 : 31 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_33.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 27/100 : 16 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_34.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 28/100 : 38 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_35.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 29/100 : 42 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_36.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 30/100 : 163 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_37.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 31/100 : 15 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_38.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 32/100 : 40 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_39.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] Start processing of batch 33/100 : 24 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0001] /tmp/obiseq_chunks_2438688034/chunk_4.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 34/100 : 5825 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_40.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 35/100 : 20 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_41.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 36/100 : 23 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_42.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 37/100 : 97 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_43.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 38/100 : 27 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_44.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 39/100 : 1863 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_45.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 40/100 : 42 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_46.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 41/100 : 40 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_47.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 42/100 : 65 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_48.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 43/100 : 39 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_49.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 44/100 : 43 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_5.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 45/100 : 38 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_50.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 46/100 : 72 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_51.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 47/100 : 12 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_52.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 48/100 : 15 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_53.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 49/100 : 75 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_54.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 50/100 : 42 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_55.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 51/100 : 39 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_56.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 52/100 : 30 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_57.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 53/100 : 28 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_58.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 54/100 : 40 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_59.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 55/100 : 32 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_6.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 56/100 : 35 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_60.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 57/100 : 35 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_61.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 58/100 : 8405 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_62.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 59/100 : 29 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_63.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 60/100 : 25 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_64.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 61/100 : 79 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_65.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 62/100 : 27 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_66.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 63/100 : 25 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_67.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 64/100 : 21 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_68.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 65/100 : 23 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_69.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 66/100 : 21 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_7.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 67/100 : 48 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_70.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 68/100 : 15 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_71.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 69/100 : 26 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_72.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 70/100 : 29 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_73.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 71/100 : 13 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_74.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 72/100 : 26 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_75.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 73/100 : 15 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_76.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 74/100 : 15 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_77.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 75/100 : 21 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_78.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 76/100 : 57 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_79.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 77/100 : 18 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_8.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 78/100 : 21 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_80.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 79/100 : 17 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_81.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 80/100 : 15 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_82.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 81/100 : 55 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_83.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 82/100 : 30 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_84.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 83/100 : 47 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_85.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 84/100 : 37 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_86.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 85/100 : 10 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_87.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 86/100 : 21 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_88.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 87/100 : 26 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_89.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 88/100 : 9 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_9.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 89/100 : 47 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_90.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 90/100 : 42 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_91.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 91/100 : 13 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_92.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 92/100 : 36 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_93.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 93/100 : 26 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_94.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 94/100 : 235 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_95.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 95/100 : 29 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_96.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 96/100 : 304 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_97.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 97/100 : 29 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_98.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 98/100 : 18 sequences \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] /tmp/obiseq_chunks_2438688034/chunk_99.fastx mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0002] Start processing of batch 99/100 : 33 sequences \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obiuniq -m sample -Z \\\n",
|
|
" wolf_assign.fastq.gz \\\n",
|
|
" > wolf_uniq.fasta.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "aeaa4c26-19b7-4485-b5bd-d4d450a6d1ac",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_uniq.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"| entities | n |\n",
|
|
"| -------- | ------ |\n",
|
|
"| variants | 989 |\n",
|
|
"| reads | 27,712 |\n",
|
|
"| symbols | 97,427 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicount wolf_uniq.fasta.gz | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "d00e1c45-c378-4899-af45-0969d62a9fdd",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Dataset cleaning\n",
|
|
"\n",
|
|
"First step: Look at singleton sequences."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"id": "cbd7039e-ce10-41f2-888c-ce09a886ab13",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_uniq.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"\n",
|
|
" 1 count\n",
|
|
" 543 1\n",
|
|
" 125 2\n",
|
|
" 86 3\n",
|
|
" 66 4\n",
|
|
" 36 5\n",
|
|
" 14 6\n",
|
|
" 36 7\n",
|
|
" 19 8\n",
|
|
" 10 9\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicsv -k count wolf_uniq.fasta.gz \\\n",
|
|
" | sort -n \\\n",
|
|
" | uniq -c \\\n",
|
|
" | head -n 10"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "b296bbb9-d187-4485-9b4c-b1df5c9e3bca",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_uniq.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obigrep -c 2 -Z \\\n",
|
|
" wolf_uniq.fasta.gz \\\n",
|
|
" > wolf_nosingleton.fasta.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 25,
|
|
"id": "a61b07fb-3215-435c-917c-cff21453b056",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_nosingleton.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"| entities | n |\n",
|
|
"| -------- | ------ |\n",
|
|
"| variants | 446 |\n",
|
|
"| reads | 27,169 |\n",
|
|
"| symbols | 43,760 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicount wolf_nosingleton.fasta.gz | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2a496d5a-afb5-4be8-97e7-21e7fefcd6ff",
|
|
"metadata": {},
|
|
"source": [
|
|
"Second step: Look at the sequence length distribution."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"id": "bab297cb-33c9-4488-8100-ca65fe6e9187",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_nosingleton.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed \n",
|
|
"\n",
|
|
"\u001b[36mINFO\u001b[0m[0000] - mime type: text/fasta \n",
|
|
"\n",
|
|
"\n",
|
|
" 1 seq_length\n",
|
|
" 5 4\n",
|
|
" 1 5\n",
|
|
" 1 8\n",
|
|
" 179 99\n",
|
|
" 259 100\n",
|
|
" 1 106\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obiannotate --length \\\n",
|
|
" wolf_nosingleton.fasta.gz \\\n",
|
|
"| obicsv -k seq_length \\\n",
|
|
"| sort -n \\\n",
|
|
"| uniq -c "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 27,
|
|
"id": "3966fc20-6629-4097-bde3-c3027d744ab1",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_nosingleton.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obigrep -l 50 -Z \\\n",
|
|
" wolf_nosingleton.fasta.gz \\\n",
|
|
" > wolf_noshort.fasta.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 29,
|
|
"id": "1865ca6b-0dd8-43d5-85c2-239d9e09327a",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_noshort.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"| entities | n |\n",
|
|
"| -------- | ------ |\n",
|
|
"| variants | 439 |\n",
|
|
"| reads | 25,290 |\n",
|
|
"| symbols | 43,727 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicount wolf_noshort.fasta.gz | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "396cb1b7-4816-4fb9-be57-b1654c8d51c1",
|
|
"metadata": {},
|
|
"source": [
|
|
"Third step: Look at ambiguous nucleotides."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 30,
|
|
"id": "88d9da85-2eb3-4ed8-aa7c-a6830aa946fe",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_noshort.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\n",
|
|
"\u001b[36mINFO\u001b[0m[0000] - mime type: text/fasta \n",
|
|
"\n",
|
|
"| entities | n |\n",
|
|
"| -------- | --- |\n",
|
|
"| variants | 4 |\n",
|
|
"| reads | 10 |\n",
|
|
"| symbols | 399 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obigrep -v -s '^[acgt]+$' \\\n",
|
|
" wolf_noshort.fasta.gz \\\n",
|
|
"| obicount | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 31,
|
|
"id": "c1d3c092-0af3-404a-a880-028dfdc99a87",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_noshort.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obigrep -Z -s '^[acgt]+$' \\\n",
|
|
" wolf_noshort.fasta.gz \\\n",
|
|
" > wolf_acgt.fasta.gz"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"id": "267a0727-ea1f-4933-8615-5b1f98b93e5c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_acgt.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"| entities | n |\n",
|
|
"| -------- | ------ |\n",
|
|
"| variants | 435 |\n",
|
|
"| reads | 25,280 |\n",
|
|
"| symbols | 43,328 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicount wolf_acgt.fasta.gz | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "dbfcc481-ca4d-485e-ac1f-023804a60b11",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Running obiclean\n",
|
|
"\n",
|
|
"#### Evaluating the ratio threshold"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"id": "7935c09f-dcf4-4d73-a7a0-3014171961a8",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_acgt.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Sequence dataset of 435 sequeences loaded \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Dataset composed of 4 samples \n",
|
|
" ror graph] 44% |██████ | (18389952 it/s) [0s:0s][One error graph] 59% |████████ | (17964160 it/s) [0s:0s][One error graph] 100% |███████████████| (19531142 it/s)[Annotate sequence status] 50% |███████ | (786 it/s) [0s:0s][Annotate sequence status] 75% |███████████ | (1141 it/s) [0s:0s][Annotate sequence status] 100% |███████████████| (1467 it/s)[Save GML Graph files] 50% |███████ | (1060 it/s) [0s:0s][Save GML Graph files] 75% |███████████ | (1202 it/s) [0s:0s][Save GML Graph files] 100% |███████████████| (1212 it/s)[Save CSV stat ratio file] 8% |█ | (81360 it/s) [0s:0s][Save CSV stat ratio file] 12% |█ | (74772 it/s) [0s:0s][Save CSV stat ratio file] 16% |██ | (73066 it/s) [0s:0s][Save CSV stat ratio file] 20% |███ | (72208 it/s) [0s:0s][Save CSV stat ratio file] 24% |███ | (70560 it/s) [0s:0s][Save CSV stat ratio file] 28% |████ | (71283 it/s) [0s:0s][Save CSV stat ratio file] 32% |████ | (66951 it/s) [0s:0s][Save CSV stat ratio file] 36% |█████ | (30132 it/s) [0s:0s][Save CSV stat ratio file] 40% |██████ | (30654 it/s) [0s:0s][Save CSV stat ratio file] 44% |██████ | (31945 it/s) [0s:0s][Save CSV stat ratio file] 48% |███████ | (33099 it/s) [0s:0s][Save CSV stat ratio file] 52% |███████ | (34555 it/s) [0s:0s][Save CSV stat ratio file] 56% |████████ | (35574 it/s) [0s:0s][Save CSV stat ratio file] 60% |█████████ | (34914 it/s) [0s:0s][Save CSV stat ratio file] 64% |█████████ | (36083 it/s) [0s:0s][Save CSV stat ratio file] 68% |██████████ | (36119 it/s) [0s:0s][Save CSV stat ratio file] 72% |██████████ | (36945 it/s) [0s:0s][Save CSV stat ratio file] 76% |███████████ | (37940 it/s) [0s:0s][Save CSV stat ratio file] 80% |████████████ | (38694 it/s) [0s:0s][Save CSV stat ratio file] 84% |████████████ | (39511 it/s) [0s:0s][Save CSV stat ratio file] 88% |█████████████ | (39952 it/s) [0s:0s][Save CSV stat ratio file] 92% |█████████████ | (38738 it/s) [0s:0s][Save CSV stat ratio file] 96% |██████████████ | (38713 it/s) [0s:0s][Save CSV stat ratio file] 100% 
|███████████████| (39438 it/s)\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obiclean --save-ratio wolf_ratio.csv \\\n",
|
|
" --save-graph wolf_graph \\\n",
|
|
" wolf_acgt.fasta.gz \\\n",
|
|
" > wolf_obiclean_1.fasta"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 38,
|
|
"id": "eeabe3a8-7069-469b-bc3c-5a0faef90fd7",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_acgt.fasta.gz mime type: text/fasta \n",
|
|
"\n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Sequence dataset of 435 sequeences loaded \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Dataset composed of 4 samples \n",
|
|
" ph] 44% |██████ | (11275210 it/s) [0s:0s][One error graph] 59% |████████ | (10893174 it/s) [0s:0s][One error graph] 100% |███████████████| (12325209 it/s)[Filter graph on abundance ratio] 50% |███████ | (54237 it/s) [0s:0s][Filter graph on abundance ratio] 75% |███████████ | (47337 it/s) [0s:0s][Filter graph on abundance ratio] 100% |███████████████| (44017 it/s)[Annotate sequence status] 50% |███████ | (18476 it/s) [0s:0s][Annotate sequence status] 75% |███████████ | (15161 it/s) [0s:0s][Annotate sequence status] 100% |███████████████| (3483 it/s)\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obiclean --detect-chimera \\\n",
|
|
" -r 0.1 -H \\\n",
|
|
" wolf_acgt.fasta.gz \\\n",
|
|
" > wolf_obiclean_2.fasta "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 39,
|
|
"id": "3742381b-c1c8-4e6e-8400-741d3c54e4a9",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_obiclean_2.fasta mime type: text/fasta \n",
|
|
"\n",
|
|
"| entities | n |\n",
|
|
"| -------- | ------ |\n",
|
|
"| variants | 30 |\n",
|
|
"| reads | 22,608 |\n",
|
|
"| symbols | 2,987 |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicount wolf_obiclean_2.fasta | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "0ee6152c-d6cf-4e53-91f9-3cab814de1dd",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Taxonomic assignment\n",
|
|
"\n",
|
|
"using `obitag`"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 41,
|
|
"id": "56245e8e-2eda-43d9-bfcd-031334780c74",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 32 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_obiclean_2.fasta mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] ./vert01_ref_db.fasta.gz mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Set as default taxonomy taxon \n",
|
|
"/ Reading sequences (1/-, 5 it/s) [0s] \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] 3851 reference sequences conserved on 3851 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obitag -R ./vert01_ref_db.fasta.gz \\\n",
|
|
" --save-db ./vert01_ref_db_indexed.fasta \\\n",
|
|
" wolf_obiclean_2.fasta \\\n",
|
|
" > wolf_taxon.fasta"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 45,
|
|
"id": "4b0663f1-cecb-4477-9d65-59d964db3e6d",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Reading sequences from stdin in guessed \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_taxon.fasta mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\n",
|
|
"\u001b[36mINFO\u001b[0m[0000] - mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obiannotate --number wolf_taxon.fasta \\\n",
|
|
"| obiannotate --set-identifier 'sprintf(\"MOTU_%03d\", annotations.seq_number)' \\\n",
|
|
" > wolf_short_id.fasta"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 48,
|
|
"id": "4a658deb-105b-4d80-88c4-243af836f577",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_short_id.fasta mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obigrep -p 'max(annotations.obiclean_weight) >= 100' \\\n",
|
|
" wolf_short_id.fasta \\\n",
|
|
" > wolf_no_rare.fasta"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 49,
|
|
"id": "7d59699a-758f-485b-81df-c73388ffafe7",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_no_rare.fasta mime type: text/fasta \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] On output use JSON headers \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Output is done on stdout \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Data is writen to stdout \n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obiannotate -k count \\\n",
|
|
" -k merged_sample \\\n",
|
|
" -k obiclean_weight \\\n",
|
|
" -k obitag_bestmatch \\\n",
|
|
" -k obitag_bestid \\\n",
|
|
" -k obitag_rank \\\n",
|
|
" -k taxid \\\n",
|
|
" wolf_no_rare.fasta \\\n",
|
|
" > wolf_taxon_cleaned.fasta"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 50,
|
|
"id": "dc7a771d-6942-49cc-9976-ff198b4df3ea",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_taxon_cleaned.fasta mime type: text/fasta \n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obicsv -i -s --auto \\\n",
|
|
" wolf_taxon_cleaned.fasta \\\n",
|
|
" > wolf_taxon_cleaned.csv"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 51,
|
|
"id": "b1605913-f581-4120-a42a-e2a181ea7aa7",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\u001b[36mINFO\u001b[0m[0000] Number of workers set 16 \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] Found 1 files to process \n",
|
|
"\u001b[36mINFO\u001b[0m[0000] wolf_no_rare.fasta mime type: text/fasta \n",
|
|
"\n",
|
|
"| taxon | id | obitag_bestid | 13a_F730603 | 15a_F730814 | 26a_F040644 | 29a_F260619 |\n",
|
|
"| ---------------------------------------- | -------- | ------------- | ----------- | ----------- | ----------- | ----------- |\n",
|
|
"| taxon:9611 [Canis]@genus | MOTU_066 | 1.000… | 9 | 4 | 328 | 1 |\n",
|
|
"| taxon:9992 [Marmota]@genus | MOTU_006 | 0.990… | 0 | 0 | 8,744 | 0 |\n",
|
|
"| taxon:35500 [Pecora]@infraorder | MOTU_014 | 0.950… | 0 | 0 | 0 | 152 |\n",
|
|
"| taxon:9860 [Cervus elaphus]@species | MOTU_017 | 1.000… | 6,192 | 0 | 0 | 0 |\n",
|
|
"| taxon:55153 [Sciuridae]@family | MOTU_020 | 0.949… | 0 | 0 | 146 | 0 |\n",
|
|
"| taxon:9858 [Capreolus capreolus]@species | MOTU_039 | 1.000… | 0 | 5,975 | 0 | 3,404 |\n",
|
|
"| | | | | | | |\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"obimatrix --transpose \\\n",
|
|
" -k id \\\n",
|
|
" -k taxid \\\n",
|
|
" -k obitag_bestid \\\n",
|
|
" --map obiclean_weight \\\n",
|
|
" wolf_no_rare.fasta | csvlook"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "db64f7f0-c891-488d-a724-66644c2348f7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Bash",
|
|
"language": "bash",
|
|
"name": "bash"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": "shell",
|
|
"file_extension": ".sh",
|
|
"mimetype": "text/x-sh",
|
|
"name": "bash"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|