The Art of Life

Nature is remarkably complex and offers a plethora of intricate patterns to those who dare to investigate. These patterns can appear in many forms. Here, we will take a look at how all (sequenced) organisms relate to each other when projecting their high-dimensional genome space down to two dimensions.

[1]:
import os
import gzip
import json
import shutil
import functools
from pathlib import Path
import multiprocessing as mp
from concurrent.futures import Future, as_completed, ProcessPoolExecutor

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

import requests
from requests_futures.sessions import FuturesSession

import khmer
import metagenompy
from Bio import SeqIO

import umap
import umap.plot

import joblib
from tqdm.auto import tqdm
[2]:
# render matplotlib figures inline in the notebook output
%matplotlib inline
# switch the interactive umap.plot figures to notebook output
umap.plot.output_notebook()
Loading BokehJS ...
[3]:
# number of workers used for the parallel download, genome parsing and UMAP
CPU_COUNT = 10
# root directory below which all downloaded data and results are stored
DATA_DIR = Path("genome_data")

Retrieve Complete Genomes

Before analyzing the genomes, we need to download them. Here, we are going to download (a subset of) all genomes available on RefSeq.

[4]:
# directory holding the downloaded genome archives and their metadata files
genome_dir = DATA_DIR / "genomes"
# parents=True also creates DATA_DIR itself if it does not exist yet
genome_dir.mkdir(parents=True, exist_ok=True)
[5]:
# download assembly info (skipped if a previous run already stored the file)
url = (
    "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt"
)
path_assembly = DATA_DIR / "assembly_information.tsv"

if not path_assembly.exists():
    # stream into a temporary file first so that an interrupted download is
    # not mistaken for a complete, cached file on the next run
    path_partial = path_assembly.with_name(f"{path_assembly.name}.part")

    # using the response as context manager releases the connection afterwards
    with requests.get(url, stream=True, allow_redirects=True) as resp:
        # fail loudly instead of storing an HTTP error page as assembly table
        resp.raise_for_status()

        # make the raw stream decode compressed transfer encodings on read
        resp.raw.read = functools.partial(resp.raw.read, decode_content=True)
        with tqdm.wrapattr(
            resp.raw, "read", desc="Download assembly info"
        ) as resp_raw:
            with path_partial.open("wb") as fd:
                shutil.copyfileobj(resp_raw, fd)

    # only a fully written file gets the final (cached) name
    path_partial.replace(path_assembly)
[6]:
# read the tab-separated assembly table, skipping the first line of the file
df_assembly = pd.read_csv(path_assembly, sep="\t", skiprows=1)

# strip the leading "#" (and any whitespace after it) from the first column name
first_column = df_assembly.columns[0]
df_assembly = df_assembly.rename(
    columns={first_column: first_column.lstrip("#").lstrip()}
)

# keep only downloadable, full, non-excluded, completely assembled genomes
has_ftp_path = df_assembly["ftp_path"].ne("na")
is_full_representation = df_assembly["genome_rep"].eq("Full")
is_not_excluded = df_assembly["excluded_from_refseq"].isna()
is_complete_genome = df_assembly["assembly_level"].eq("Complete Genome")

df_assembly = df_assembly[
    has_ftp_path & is_full_representation & is_not_excluded & is_complete_genome
]

# report how many assemblies remain and preview the table
print(df_assembly.shape)
df_assembly.head()
(36277, 23)
[6]:
assembly_accession bioproject biosample wgs_master refseq_category taxid species_taxid organism_name infraspecific_name isolate ... genome_rep seq_rel_date asm_name submitter gbrs_paired_asm paired_asm_comp ftp_path excluded_from_refseq relation_to_type_material asm_not_live_date
18 GCF_000002515.2 PRJNA12377 SAMEA3138170 NaN representative genome 28985 28985 Kluyveromyces lactis strain=NRRL Y-1140 NaN ... Full 2004/07/02 ASM251v1 Genolevures Consortium GCA_000002515.1 different https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... NaN NaN na
24 GCF_000002725.2 PRJNA15564 SAMEA3138173 NaN representative genome 347515 5664 Leishmania major strain Friedlin strain=Friedlin NaN ... Full 2011/02/14 ASM272v2 Friedlin Consortium GCA_000002725.2 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... NaN NaN na
25 GCF_000002765.5 PRJNA148 SAMN00102897 NaN representative genome 36329 5833 Plasmodium falciparum 3D7 NaN 3D7 ... Full 2016/04/07 GCA_000002765 Plasmodium falciparum Genome Sequencing Consor... GCA_000002765.3 different https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... NaN NaN na
34 GCF_000002985.6 PRJNA158 SAMEA3138177 NaN reference genome 6239 6239 Caenorhabditis elegans strain=Bristol N2 NaN ... Full 2013/02/07 WBcel235 C. elegans Sequencing Consortium GCA_000002985.3 different https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... NaN NaN na
65 GCF_000005825.2 PRJNA224116 SAMN02603086 NaN na 398511 79885 Alkalihalobacillus pseudofirmus OF4 strain=OF4 NaN ... Full 2010/12/15 ASM582v2 Center for Genomic Sciences, Allegheny-Singer ... GCA_000005825.2 identical https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0... NaN NaN na

5 rows × 23 columns

[7]:
def download_genome(row, target_dir, session):
    """Return a future for the genome archive of one assembly table row.

    The archive name is derived from the last component of ``row.ftp_path``.
    If the archive already exists in ``target_dir``, an already resolved
    future carrying a placeholder result is returned; otherwise the download
    is requested through ``session.get``.

    The returned future is annotated with ``path`` (archive location),
    ``meta_path`` (archive location with ``.json`` appended, as string),
    ``accession`` and ``taxid`` so that the consumer knows where to store
    the payload and its metadata.
    """
    assembly_name = row.ftp_path.rsplit("/", 1)[-1]
    fname = f"{assembly_name}_genomic.fna.gz"
    path = target_dir / fname

    if not path.exists():
        # not cached yet: schedule the actual download
        future = session.get(f"{row.ftp_path}/{fname}")
    else:
        # cached: hand back a finished future with a non-response placeholder
        future = Future()
        future.set_result("foo")

    # attach bookkeeping information for whoever consumes the future
    annotations = {
        "path": path,
        "meta_path": f"{path}.json",
        "accession": row.assembly_accession,
        "taxid": row.taxid,
    }
    for attribute, value in annotations.items():
        setattr(future, attribute, value)

    return future

To speed up the download (which is IO bound), we will distribute this task over multiple threads.

[8]:
with FuturesSession(max_workers=CPU_COUNT) as session:
    # one future per assembly: either a pending download or a cache placeholder
    futures = df_assembly.apply(
        download_genome, axis=1, args=(genome_dir, session)
    ).tolist()

    for future in tqdm(
        as_completed(futures), total=len(futures), desc="Download genomes"
    ):
        resp = future.result()

        # cached entries resolve to a placeholder instead of a response
        if not isinstance(resp, requests.models.Response):
            continue

        # never store an HTTP error page as if it were a genome archive
        if not resp.ok:
            tqdm.write(f"Skipping {future.accession}: HTTP {resp.status_code}")
            continue

        with open(future.meta_path, "w") as fd:
            json.dump({"accession": future.accession, "taxid": future.taxid}, fd)

        # write under a temporary name and rename afterwards, so that an
        # interrupted write never leaves a truncated archive which would be
        # treated as cached on the next run
        path_partial = future.path.with_name(f"{future.path.name}.part")
        path_partial.write_bytes(resp.content)
        path_partial.replace(future.path)

Compute Genomic Features

We are going to represent DNA sequences by their kmer count profile. In addition, we will compute further statistics, such as the number of base pairs in each genome.

Helper functions

[9]:
def load_files(path):
    """Retrieve sequence and metadata for given entry.

    Parameters
    ----------
    path
        Location of a gzip-compressed FASTA file. A JSON file with the same
        name plus a ``.json`` suffix is expected next to it.

    Returns
    -------
    tuple
        ``(seq, metadata)`` where ``seq`` is the upper-cased concatenation of
        all FASTA records in file order and ``metadata`` is the parsed JSON
        content.
    """
    # stream the records and join once instead of growing a string per record
    with gzip.open(path, "rt") as fd:
        seq = "".join(str(record.seq) for record in SeqIO.parse(fd, "fasta"))
    seq = seq.upper()

    # get metadata
    path_meta = f"{path}.json"
    with open(path_meta, encoding="utf-8") as fd:
        metadata = json.load(fd)

    return seq, metadata
[10]:
def compute_kmer_counts(seq, k=3):
    """Count the kmers of length ``k`` in ``seq`` with a khmer Countgraph.

    Follows
    https://github.com/dib-lab/khmer/blob/master/examples/python-api/exact-counting.py

    Returns a dict with ``4**k`` entries mapping ``reverse_hash(i)`` to the
    count stored for hash ``i``.
    """
    n_hashes = 4**k

    # single table with slightly more slots than there are hash values
    counter = khmer.Countgraph(k, n_hashes + 10, 1)
    counter.set_use_bigcount(True)  # increase max count from 255 to 65535
    counter.consume(seq)

    # translate every hash value back to its kmer string
    counts = {}
    for hash_value in range(n_hashes):
        counts[counter.reverse_hash(hash_value)] = counter.get(hash_value)
    return counts
[11]:
def parse_entry(path):
    """Load one genome archive and derive its metadata and 5-mer counts.

    Returns the metadata dict (extended by ``genome_size``, the length of
    the loaded sequence) together with the kmer count dict.
    """
    assert str(path).endswith("_genomic.fna.gz"), path

    sequence, meta = load_files(path)

    # extend the stored metadata by the total sequence length
    meta.update(genome_size=len(sequence))

    # count kmers
    return meta, compute_kmer_counts(sequence, k=5)

Parse data

As computing these features is CPU bound, we are going to make use of multiprocessing.

[12]:
kmer_data = {}
metadata = []

# only the genome archives are parsed; other files in the directory
# (e.g. the metadata JSON files) do not match the suffix
genome_paths = [
    path for path in genome_dir.iterdir() if str(path).endswith("_genomic.fna.gz")
]

fork_context = mp.get_context("fork")
with ProcessPoolExecutor(max_workers=CPU_COUNT, mp_context=fork_context) as executor:
    futures = [executor.submit(parse_entry, path) for path in genome_paths]

    progress = tqdm(as_completed(futures), total=len(futures), desc="Parse genomes")
    for future in progress:
        # results arrive in completion order, not submission order
        meta, kmer_counts = future.result()
        accession = meta["accession"]

        metadata.append(meta)

        # every accession must occur only once
        assert accession not in kmer_data
        kmer_data[accession] = kmer_counts

Determine taxonomic lineage for each entry

To investigate how different types of organisms relate to each other, we will characterize each organism by its taxonomic lineage, i.e. its classification at several taxonomic ranks.

[13]:
# taxonomic ranks to annotate each genome with (passed to the classification
# step below, where they show up as columns of the metadata table)
rank_list = ["species", "phylum", "clade", "kingdom", "superkingdom"]
[14]:
# build the taxonomy graph used for the lineage classification below
# NOTE(review): auto_download=True presumably fetches the taxonomy files
# when they are not available locally; confirm against the metagenompy docs
graph = metagenompy.generate_taxonomy_network(auto_download=True)
Parsing names: 100%|██████████| 3532357/3532357 [00:04<00:00, 721381.84it/s]
Parsing nodes: 100%|██████████| 2388279/2388279 [00:21<00:00, 112286.89it/s]
[15]:
# one row per genome, indexed by accession; taxids are converted to strings
# before the classification
df_meta = (
    pd.DataFrame(metadata)
    .set_index("accession")
    .astype({"taxid": str})
)

# add one column per requested taxonomic rank
df_meta = metagenompy.classify_dataframe(graph, df_meta, rank_list=rank_list)
Classifying: 100%|██████████| 5/5 [00:02<00:00,  2.22it/s]

Save results

[16]:
# persist the annotated metadata table and preview its first rows
df_meta.to_csv(DATA_DIR / "metadata.csv.gz")
df_meta.head()
[16]:
taxid genome_size species phylum clade kingdom superkingdom
accession
GCF_002191655.1 29459 3312719 Brucella melitensis Proteobacteria <NA> <NA> Bacteria
GCF_000852745.1 103881 17266 Potato yellow vein virus Kitrinoviricota Riboviria Orthornavirae Viruses
GCF_002197575.1 1983777 14964 Avian metaavulavirus 15 Negarnaviricota Riboviria Orthornavirae Viruses
GCF_006384535.1 2588128 59514 Gordonia phage Barb Uroviricota Duplodnaviria Heunggongvirae Viruses
GCF_000025865.1 547558 2012424 Methanohalophilus mahii Euryarchaeota Stenosarchaea group <NA> Archaea
[17]:
# rows are kmers, columns are genome accessions
df_kmer = pd.DataFrame(kmer_data).rename_axis(index="kmer")

# persist the count table and preview its first rows
df_kmer.to_csv(DATA_DIR / "kmer_counts.csv.gz")
df_kmer.head()
[17]:
GCF_002191655.1 GCF_000852745.1 GCF_002197575.1 GCF_006384535.1 GCF_000025865.1 GCF_000019085.1 GCF_000800395.1 GCF_000861705.1 GCF_000879055.1 GCF_014127105.1 ... GCF_002448155.1 GCF_016026895.1 GCF_003595175.1 GCF_019192625.1 GCF_018289355.1 GCF_007954485.1 GCF_016403105.1 GCF_011765625.1 GCF_900638255.1 GCF_015571675.1
kmer
AAAAA 7574 120 51 3 15189 37345 2727 63 13 6061 ... 1467 16316 4522 22659 36468 42322 12171 33204 7380 23280
AAAAT 7646 160 63 6 11130 27333 1677 44 8 6781 ... 1625 11873 2734 17629 28855 34908 11573 28167 6670 18076
AAAAC 7884 72 33 11 7672 12339 2710 47 11 7593 ... 3149 13131 4546 16801 19309 22568 10829 20225 8343 17542
AAAAG 9115 77 29 4 10460 24769 3071 40 7 7159 ... 2251 6911 3583 12260 26820 23141 8646 18385 4012 12757
AAATA 4774 119 56 11 9577 21599 1022 19 12 3594 ... 1328 6824 2166 11850 28194 27460 5007 23131 4916 11996

5 rows × 36277 columns

Data overview

Before analyzing the data in more depth, we check whether reasonable kmer counts have been generated and whether the taxonomic classification worked out.

[18]:
# did the kmer counting work?
# fraction of all table entries that sit at the upper limit of the counter
is_saturated = df_kmer.to_numpy() >= 65535
max_count_fraction = is_saturated.mean()
print(f"{max_count_fraction * 100:.2f}% of kmer counts have reached numeric maximum")
0.08% of kmer counts have reached numeric maximum
[19]:
# how many NAs are in our taxonomic metadata (counted per column)
df_meta.isna().sum()
[19]:
taxid               0
genome_size         0
species            27
phylum           2041
clade           15606
kingdom         26965
superkingdom       27
dtype: int64

Additionally, we can briefly look at some interesting summary statistics.

[20]:
fig, ax = plt.subplots(figsize=(8, 6))

# genome size histogram, colored by superkingdom, on a log-scaled x axis
sns.histplot(data=df_meta, x="genome_size", hue="superkingdom", log_scale=True, ax=ax)

ax.set_xlabel("Genome Size [bp]")
# log-scale the count axis as well
ax.set_yscale("log")

fig.tight_layout()
fig.savefig(DATA_DIR / "genome_size_hist.pdf")
../_images/mining_visualization_ArtOfLife_29_0.png

Kmer statistics

Before reducing the dimensionality of the kmer space, let’s look at a few of its features.

Let’s start by checking the overall kmer count distribution. We can observe a peak at \(0\) as well as a peak at the numeric kmer count maximum of \(65535\).

[21]:
fig, ax = plt.subplots(figsize=(8, 6))

# flatten the whole count table so that all (kmer, genome) entries are pooled
sns.histplot(data=df_kmer.values.ravel(), bins=30, ax=ax)

ax.set_xlabel("Kmer frequency")
# log-scale the count axis
ax.set_yscale("log")

fig.tight_layout()
fig.savefig(DATA_DIR / "kmer_count_hist.pdf")
../_images/mining_visualization_ArtOfLife_31_0.png

We then continue by looking at the most/least common kmers according to their median count across all organisms.

[22]:
# median count of every kmer across all organisms
kmer_counts = df_kmer.median(axis=1)

# drop kmers with a median of zero and rank the rest, most common first;
# without the sort, head()/tail() would only show the first/last kmers in
# index order instead of the most/least common ones
kmer_counts = kmer_counts[kmer_counts > 0].sort_values(ascending=False)

print("Most common kmers:")
print(kmer_counts.head())
print()
print("Least common (non-zero) kmers:")
print(kmer_counts.tail())
Most common kmers:
kmer
AAAAA    8464.0
AAAAT    6449.0
AAAAC    6518.0
AAAAG    6762.0
AAATA    4391.0
dtype: float64

Least common (non-zero) kmers:
kmer
GCCCC    2177.0
GCCGC    3081.0
GCGCC    2676.0
GGACC    1526.0
GGCCC    1114.0
dtype: float64

Finally, we can enjoy a clustered heatmap.

[23]:
%%time

# retain rows with non-zero entries and columns with not-low entries
df_kmer_sub = df_kmer.loc[(df_kmer > 0).any(axis=1), (df_kmer.median(axis=0) > 10)]
df_kmer_sub.columns.rename("organism", inplace=True)

# generate column color map
rank_colors = {
    rank: sns.color_palette("husl", df_meta["superkingdom"].nunique(dropna=False))[i]
    for i, rank in enumerate(df_meta["superkingdom"].unique())
}
rank_cmap = df_meta.loc[df_kmer_sub.columns, "superkingdom"].map(rank_colors)

# create plot
g = sns.clustermap(
    df_kmer_sub,
    col_colors=rank_cmap,
    rasterized=True,
    figsize=(12, 12),
)

g.cax.set_title("kmer count")
g.ax_heatmap.tick_params(bottom=False, labelbottom=False, right=False, labelright=False)

g.ax_heatmap.legend(
    handles=[Patch(facecolor=color, label=name) for name, color in rank_colors.items()],
    title="Superkingdom",
    bbox_to_anchor=(1.05, 1),
    loc="upper left",
)

fig.savefig(DATA_DIR / "kmer_heatmap.pdf", dpi=300)
/cluster/work/bewi/nss/apps/gcc-6.3.0/conda/4.8.3/lib/python3.8/site-packages/seaborn/matrix.py:654: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
CPU times: user 2min 40s, sys: 6.02 s, total: 2min 46s
Wall time: 2min 47s
../_images/mining_visualization_ArtOfLife_35_2.png

Visualize Projected Genome Space

Finally, we can project the high-dimensional kmer space to two dimensions and explore its topology.

[24]:
# UMAP configuration for the kmer profiles
# NOTE(review): newer umap-learn releases reportedly force n_jobs=1 when
# random_state is set; confirm for the installed version
reducer = umap.UMAP(
    metric="cosine",
    random_state=42,
    low_memory=True,
    verbose=True,
    n_neighbors=100,
    n_jobs=min(CPU_COUNT, 8),  # never use more than 8 jobs
)
[25]:
# df_kmer has kmers as rows and genomes as columns; transpose so that every
# genome becomes one sample described by its kmer counts
reducer.fit(df_kmer.T)
UMAP(angular_rp_forest=True, metric='cosine', n_jobs=8, n_neighbors=100, random_state=42, verbose=True)
Wed Jan  5 23:36:01 2022 Construct fuzzy simplicial set
Wed Jan  5 23:36:01 2022 Finding Nearest Neighbors
Wed Jan  5 23:36:01 2022 Building RP forest with 15 trees
Wed Jan  5 23:36:06 2022 NN descent for 15 iterations
         1  /  15
         2  /  15
         3  /  15
        Stopping threshold met -- exiting after 3 iterations
Wed Jan  5 23:37:06 2022 Finished Nearest Neighbor Search
Wed Jan  5 23:37:11 2022 Construct embedding
Wed Jan  5 23:38:22 2022 Finished embedding
[25]:
UMAP(angular_rp_forest=True, metric='cosine', n_jobs=8, n_neighbors=100, random_state=42, verbose=True)
[26]:
# persist the fitted UMAP model next to the other results
joblib.dump(reducer, DATA_DIR / "umap_model.joblib")
Wed Jan  5 23:38:27 2022 Worst tree score: 0.96907131
Wed Jan  5 23:38:27 2022 Mean tree score: 0.97277063
Wed Jan  5 23:38:27 2022 Best tree score: 0.97499793
Wed Jan  5 23:38:37 2022 Forward diversification reduced edges from 3627700 to 258377
Wed Jan  5 23:38:40 2022 Reverse diversification reduced edges from 258377 to 257088
/cluster/home/kimja/.local/lib/python3.8/site-packages/scipy/sparse/_index.py:125: SparseEfficiencyWarning: Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
  self._set_arrayXarray(i, j, x)
Wed Jan  5 23:38:42 2022 Degree pruning reduced edges from 297740 to 297717
Wed Jan  5 23:38:42 2022 Resorting data and graph based on tree order
Wed Jan  5 23:38:42 2022 Building and compiling search function
[26]:
['genome_data/umap_model.joblib']

Static visualization

We can look at a static image…

[27]:
def format_taxonomy_string(taxonomy, rank_list=("superkingdom", "kingdom")):
    """Convert classification columns to readable string.

    Parameters
    ----------
    taxonomy
        Mapping-like object (e.g. a metadata row) that can be indexed by
        rank name.
    rank_list
        Rank names to include, in output order. The default is an immutable
        tuple so that it cannot be modified across calls.

    Returns
    -------
    str
        The ``str()`` of each selected entry, joined by ``;``. An empty
        ``rank_list`` yields an empty string.
    """
    return ";".join(str(taxonomy[rank]) for rank in rank_list)
[28]:
# project every genome (one row of the transposed count table) to two dimensions
embedding = reducer.transform(df_kmer.T)

# coordinates, indexed by accession in the same order as the input columns
df_umap = pd.DataFrame(embedding, columns=["UMAP0", "UMAP1"])
df_umap.index = pd.Index(df_kmer.columns, name="accession")

# label every genome with its superkingdom
df_umap["taxonomic_rank"] = [
    format_taxonomy_string(df_meta.loc[accession], rank_list=["superkingdom"])
    for accession in df_umap.index
]

# aligned on the accession index
df_umap["genome_size"] = df_meta["genome_size"]

df_umap.to_csv(DATA_DIR / "umap.csv.gz")
print(df_umap.shape)
df_umap.head()
(36277, 4)
[28]:
UMAP0 UMAP1 taxonomic_rank genome_size
accession
GCF_002191655.1 18.933571 1.356702 Bacteria 3312719
GCF_000852745.1 7.202410 5.276213 Viruses 17266
GCF_002197575.1 7.665549 3.549558 Viruses 14964
GCF_006384535.1 17.502321 -1.682356 Viruses 59514
GCF_000025865.1 8.906975 5.523824 Archaea 2012424
[29]:
fig, ax = plt.subplots(figsize=(10, 10))

# one point per genome: color encodes the superkingdom label, marker size the
# genome size; the palette has one color per distinct label
sns.scatterplot(
    data=df_umap,
    x="UMAP0",
    y="UMAP1",
    hue="taxonomic_rank",
    size="genome_size",
    rasterized=True,
    palette=sns.color_palette("husl", df_umap["taxonomic_rank"].nunique()),
    ax=ax,
)

fig.tight_layout()
fig.savefig(DATA_DIR / "umap.pdf", dpi=300)
../_images/mining_visualization_ArtOfLife_43_0.png

Interactive visualization

…but also pan and zoom around in an interactive view.

[30]:
# metadata rows in the same order as the genomes passed to UMAP, with the
# accession turned into a regular column
hover_data = df_meta.loc[df_kmer.columns].rename_axis("accession").reset_index()

# human-readable genome size with thousands separators, e.g. "3,312,719 bp"
hover_data["genome_size"] = hover_data["genome_size"].apply(lambda x: f"{x:,} bp")
[31]:
# interactive view of the fitted model; the labels are passed with their
# accession index dropped, i.e. by position
p = umap.plot.interactive(
    reducer,
    labels=df_umap["taxonomic_rank"].reset_index(drop=True),
    theme="fire",
    hover_data=hover_data,
    point_size=2,
)
umap.plot.show(p)