{ "cells": [ { "cell_type": "markdown", "id": "satisfied-shelf", "metadata": {}, "source": [ "# The Art of Life\n", "\n", "Nature is remarkably complex and offers a plethora of intricate patterns to those who dare to investigate.\n", "These patterns can appear in many forms. Here, we will take a look at how all (sequenced) organisms relate to each other when \n", "their high-dimensional genome space is projected down to two dimensions." ] }, { "cell_type": "code", "execution_count": 1, "id": "bearing-association", "metadata": { "execution": { "iopub.execute_input": "2022-01-05T21:49:14.074889Z", "iopub.status.busy": "2022-01-05T21:49:14.074072Z", "iopub.status.idle": "2022-01-05T21:51:30.989737Z", "shell.execute_reply": "2022-01-05T21:51:30.988738Z" } }, "outputs": [], "source": [ "import os\n", "import gzip\n", "import json\n", "import shutil\n", "import functools\n", "from pathlib import Path\n", "import multiprocessing as mp\n", "from concurrent.futures import Future, as_completed, ProcessPoolExecutor\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "from matplotlib.patches import Patch\n", "\n", "import requests\n", "from requests_futures.sessions import FuturesSession\n", "\n", "import khmer\n", "import metagenompy\n", "from Bio import SeqIO\n", "\n", "import umap\n", "import umap.plot\n", "\n", "import joblib\n", "from tqdm.auto import tqdm" ] }, { "cell_type": "code", "execution_count": 2, "id": "hungarian-baseline", "metadata": { "execution": { "iopub.execute_input": "2022-01-05T21:51:31.006200Z", "iopub.status.busy": "2022-01-05T21:51:31.005423Z", "iopub.status.idle": "2022-01-05T21:51:31.024710Z", "shell.execute_reply": "2022-01-05T21:51:31.025400Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/javascript": [ "\n", "(function(root) {\n", " function now() {\n", " return new Date();\n", " }\n", "\n", " const force = true;\n", "\n", " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", " root._bokeh_onload_callbacks = [];\n", " root._bokeh_is_loading = undefined;\n", " }\n", "\n", " const JS_MIME_TYPE = 'application/javascript';\n", " const HTML_MIME_TYPE = 'text/html';\n", " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", " const CLASS_NAME = 'output_bokeh rendered_html';\n", "\n", " /**\n", " * Render data to the DOM node\n", " */\n", " function render(props, node) {\n", " const script = document.createElement(\"script\");\n", " node.appendChild(script);\n", " }\n", "\n", " /**\n", " * Handle when an output is cleared or removed\n", " */\n", " function handleClearOutput(event, handle) {\n", " const cell = handle.cell;\n", "\n", " const id = cell.output_area._bokeh_element_id;\n", " const server_id = cell.output_area._bokeh_server_id;\n", " // Clean up Bokeh references\n", " if (id != null && id in Bokeh.index) {\n", " Bokeh.index[id].model.document.clear();\n", " delete Bokeh.index[id];\n", " }\n", "\n", " if (server_id !== undefined) {\n", " // Clean up Bokeh references\n", " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", " cell.notebook.kernel.execute(cmd_clean, {\n", " iopub: {\n", " output: function(msg) {\n", " const id = msg.content.text.trim();\n", " if (id in Bokeh.index) {\n", " Bokeh.index[id].model.document.clear();\n", " delete Bokeh.index[id];\n", " }\n", " }\n", " }\n", " });\n", " // Destroy server and session\n", " const cmd_destroy = \"import bokeh.io.notebook as ion; ion.destroy_server('\" + server_id + \"')\";\n", " cell.notebook.kernel.execute(cmd_destroy);\n", " }\n", " }\n", "\n", " /**\n", " * 
Handle when a new output is added\n", " */\n", " function handleAddOutput(event, handle) {\n", " const output_area = handle.output_area;\n", " const output = handle.output;\n", "\n", " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", " return\n", " }\n", "\n", " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", "\n", " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", " // store reference to embed id on output_area\n", " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", " }\n", " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", " const bk_div = document.createElement(\"div\");\n", " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", " const script_attrs = bk_div.children[0].attributes;\n", " for (let i = 0; i < script_attrs.length; i++) {\n", " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", " }\n", " // store reference to server id on output_area\n", " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", " }\n", " }\n", "\n", " function register_renderer(events, OutputArea) {\n", "\n", " function append_mime(data, metadata, element) {\n", " // create a DOM node to render to\n", " const toinsert = this.create_output_subarea(\n", " metadata,\n", " CLASS_NAME,\n", " EXEC_MIME_TYPE\n", " );\n", " this.keyboard_manager.register_events(toinsert);\n", " // Render to node\n", " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", " render(props, toinsert[toinsert.length - 1]);\n", " element.append(toinsert);\n", " return toinsert\n", " }\n", "\n", " 
/* Handle when an output is cleared or removed */\n", " events.on('clear_output.CodeCell', handleClearOutput);\n", " events.on('delete.Cell', handleClearOutput);\n", "\n", " /* Handle when a new output is added */\n", " events.on('output_added.OutputArea', handleAddOutput);\n", "\n", " /**\n", " * Register the mime type and append_mime function with output_area\n", " */\n", " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", " /* Is output safe? */\n", " safe: true,\n", " /* Index of renderer in `output_area.display_order` */\n", " index: 0\n", " });\n", " }\n", "\n", " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", " if (root.Jupyter !== undefined) {\n", " const events = require('base/js/events');\n", " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", "\n", " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", " register_renderer(events, OutputArea);\n", " }\n", " }\n", "\n", " \n", " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", " root._bokeh_timeout = Date.now() + 5000;\n", " root._bokeh_failed_load = false;\n", " }\n", "\n", " const NB_LOAD_WARNING = {'data': {'text/html':\n", " \"\\n\"+\n", " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", " \"
\\n\"+\n", " \"\\n\"+\n",
" \"from bokeh.resources import INLINE\\n\"+\n",
" \"output_notebook(resources=INLINE)\\n\"+\n",
" \"
\\n\"+\n",
" \"\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"
\\n\"+\n \"\n", " | assembly_accession | \n", "bioproject | \n", "biosample | \n", "wgs_master | \n", "refseq_category | \n", "taxid | \n", "species_taxid | \n", "organism_name | \n", "infraspecific_name | \n", "isolate | \n", "... | \n", "genome_rep | \n", "seq_rel_date | \n", "asm_name | \n", "submitter | \n", "gbrs_paired_asm | \n", "paired_asm_comp | \n", "ftp_path | \n", "excluded_from_refseq | \n", "relation_to_type_material | \n", "asm_not_live_date | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
18 | \n", "GCF_000002515.2 | \n", "PRJNA12377 | \n", "SAMEA3138170 | \n", "NaN | \n", "representative genome | \n", "28985 | \n", "28985 | \n", "Kluyveromyces lactis | \n", "strain=NRRL Y-1140 | \n", "NaN | \n", "... | \n", "Full | \n", "2004/07/02 | \n", "ASM251v1 | \n", "Genolevures Consortium | \n", "GCA_000002515.1 | \n", "different | \n", "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... | \n", "NaN | \n", "NaN | \n", "na | \n", "
24 | \n", "GCF_000002725.2 | \n", "PRJNA15564 | \n", "SAMEA3138173 | \n", "NaN | \n", "representative genome | \n", "347515 | \n", "5664 | \n", "Leishmania major strain Friedlin | \n", "strain=Friedlin | \n", "NaN | \n", "... | \n", "Full | \n", "2011/02/14 | \n", "ASM272v2 | \n", "Friedlin Consortium | \n", "GCA_000002725.2 | \n", "identical | \n", "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... | \n", "NaN | \n", "NaN | \n", "na | \n", "
25 | \n", "GCF_000002765.5 | \n", "PRJNA148 | \n", "SAMN00102897 | \n", "NaN | \n", "representative genome | \n", "36329 | \n", "5833 | \n", "Plasmodium falciparum 3D7 | \n", "NaN | \n", "3D7 | \n", "... | \n", "Full | \n", "2016/04/07 | \n", "GCA_000002765 | \n", "Plasmodium falciparum Genome Sequencing Consor... | \n", "GCA_000002765.3 | \n", "different | \n", "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... | \n", "NaN | \n", "NaN | \n", "na | \n", "
34 | \n", "GCF_000002985.6 | \n", "PRJNA158 | \n", "SAMEA3138177 | \n", "NaN | \n", "reference genome | \n", "6239 | \n", "6239 | \n", "Caenorhabditis elegans | \n", "strain=Bristol N2 | \n", "NaN | \n", "... | \n", "Full | \n", "2013/02/07 | \n", "WBcel235 | \n", "C. elegans Sequencing Consortium | \n", "GCA_000002985.3 | \n", "different | \n", "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... | \n", "NaN | \n", "NaN | \n", "na | \n", "
65 | \n", "GCF_000005825.2 | \n", "PRJNA224116 | \n", "SAMN02603086 | \n", "NaN | \n", "na | \n", "398511 | \n", "79885 | \n", "Alkalihalobacillus pseudofirmus OF4 | \n", "strain=OF4 | \n", "NaN | \n", "... | \n", "Full | \n", "2010/12/15 | \n", "ASM582v2 | \n", "Center for Genomic Sciences, Allegheny-Singer ... | \n", "GCA_000005825.2 | \n", "identical | \n", "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... | \n", "NaN | \n", "NaN | \n", "na | \n", "
5 rows × 23 columns
\n", "\n", " | taxid | \n", "genome_size | \n", "species | \n", "phylum | \n", "clade | \n", "kingdom | \n", "superkingdom | \n", "
---|---|---|---|---|---|---|---|
accession | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
GCF_002191655.1 | \n", "29459 | \n", "3312719 | \n", "Brucella melitensis | \n", "Proteobacteria | \n", "<NA> | \n", "<NA> | \n", "Bacteria | \n", "
GCF_000852745.1 | \n", "103881 | \n", "17266 | \n", "Potato yellow vein virus | \n", "Kitrinoviricota | \n", "Riboviria | \n", "Orthornavirae | \n", "Viruses | \n", "
GCF_002197575.1 | \n", "1983777 | \n", "14964 | \n", "Avian metaavulavirus 15 | \n", "Negarnaviricota | \n", "Riboviria | \n", "Orthornavirae | \n", "Viruses | \n", "
GCF_006384535.1 | \n", "2588128 | \n", "59514 | \n", "Gordonia phage Barb | \n", "Uroviricota | \n", "Duplodnaviria | \n", "Heunggongvirae | \n", "Viruses | \n", "
GCF_000025865.1 | \n", "547558 | \n", "2012424 | \n", "Methanohalophilus mahii | \n", "Euryarchaeota | \n", "Stenosarchaea group | \n", "<NA> | \n", "Archaea | \n", "
\n", " | GCF_002191655.1 | \n", "GCF_000852745.1 | \n", "GCF_002197575.1 | \n", "GCF_006384535.1 | \n", "GCF_000025865.1 | \n", "GCF_000019085.1 | \n", "GCF_000800395.1 | \n", "GCF_000861705.1 | \n", "GCF_000879055.1 | \n", "GCF_014127105.1 | \n", "... | \n", "GCF_002448155.1 | \n", "GCF_016026895.1 | \n", "GCF_003595175.1 | \n", "GCF_019192625.1 | \n", "GCF_018289355.1 | \n", "GCF_007954485.1 | \n", "GCF_016403105.1 | \n", "GCF_011765625.1 | \n", "GCF_900638255.1 | \n", "GCF_015571675.1 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
kmer | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
AAAAA | \n", "7574 | \n", "120 | \n", "51 | \n", "3 | \n", "15189 | \n", "37345 | \n", "2727 | \n", "63 | \n", "13 | \n", "6061 | \n", "... | \n", "1467 | \n", "16316 | \n", "4522 | \n", "22659 | \n", "36468 | \n", "42322 | \n", "12171 | \n", "33204 | \n", "7380 | \n", "23280 | \n", "
AAAAT | \n", "7646 | \n", "160 | \n", "63 | \n", "6 | \n", "11130 | \n", "27333 | \n", "1677 | \n", "44 | \n", "8 | \n", "6781 | \n", "... | \n", "1625 | \n", "11873 | \n", "2734 | \n", "17629 | \n", "28855 | \n", "34908 | \n", "11573 | \n", "28167 | \n", "6670 | \n", "18076 | \n", "
AAAAC | \n", "7884 | \n", "72 | \n", "33 | \n", "11 | \n", "7672 | \n", "12339 | \n", "2710 | \n", "47 | \n", "11 | \n", "7593 | \n", "... | \n", "3149 | \n", "13131 | \n", "4546 | \n", "16801 | \n", "19309 | \n", "22568 | \n", "10829 | \n", "20225 | \n", "8343 | \n", "17542 | \n", "
AAAAG | \n", "9115 | \n", "77 | \n", "29 | \n", "4 | \n", "10460 | \n", "24769 | \n", "3071 | \n", "40 | \n", "7 | \n", "7159 | \n", "... | \n", "2251 | \n", "6911 | \n", "3583 | \n", "12260 | \n", "26820 | \n", "23141 | \n", "8646 | \n", "18385 | \n", "4012 | \n", "12757 | \n", "
AAATA | \n", "4774 | \n", "119 | \n", "56 | \n", "11 | \n", "9577 | \n", "21599 | \n", "1022 | \n", "19 | \n", "12 | \n", "3594 | \n", "... | \n", "1328 | \n", "6824 | \n", "2166 | \n", "11850 | \n", "28194 | \n", "27460 | \n", "5007 | \n", "23131 | \n", "4916 | \n", "11996 | \n", "
5 rows × 36277 columns
\n", "\n", " | UMAP0 | \n", "UMAP1 | \n", "taxonomic_rank | \n", "genome_size | \n", "
---|---|---|---|---|
accession | \n", "\n", " | \n", " | \n", " | \n", " |
GCF_002191655.1 | \n", "18.933571 | \n", "1.356702 | \n", "Bacteria | \n", "3312719 | \n", "
GCF_000852745.1 | \n", "7.202410 | \n", "5.276213 | \n", "Viruses | \n", "17266 | \n", "
GCF_002197575.1 | \n", "7.665549 | \n", "3.549558 | \n", "Viruses | \n", "14964 | \n", "
GCF_006384535.1 | \n", "17.502321 | \n", "-1.682356 | \n", "Viruses | \n", "59514 | \n", "
GCF_000025865.1 | \n", "8.906975 | \n", "5.523824 | \n", "Archaea | \n", "2012424 | \n", "