Visualising CDR loop clustering

Introduction

In this notebook, we visualise the clusters of loop conformations determined by the pair-wise distance comparison of all loops.

[1]:

import glob
import os
import subprocess
import tempfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import logomaker
from ipywidgets import Dropdown, VBox
from IPython.display import display, clear_output
from python_pdb.aligners import align_pandas_structure
from python_pdb.parsers import parse_pdb_to_pandas
from sklearn.manifold import TSNE

from tcr_pmhc_interface_analysis.processing import annotate_tcr_pmhc_df, find_anchors
from tcr_pmhc_interface_analysis.utils import get_coords

[2]:

# Directory read below for `structure_names.txt` and the `*_distance_matrix.txt*` files.
DATA_DIR = '../data/interim/structure-pw-distances'

[3]:

# Read one structure name per line, trimming surrounding whitespace.
structure_names_path = os.path.join(DATA_DIR, 'structure_names.txt')
with open(structure_names_path, 'r') as fh:
    structure_names = [raw_name.strip() for raw_name in fh]

[4]:

# Cluster assignment, sequence and cluster type of every CDR loop.
# The `cluster` column mixes numeric labels with the literal 'noise' and is compared and
# concatenated as a string further down, so read it explicitly as strings instead of
# relying on dtype inference.
cdr_clusters = pd.read_csv('../data/processed/stcrdab_clusters.csv', dtype={'cluster': str})
cdr_clusters

[4]:

	name	cluster	chain_type	cdr	sequence	cluster_type
0	7zt2_DE	12	alpha_chain	1	TSGFNG	pseudo
1	7zt3_DE	12	alpha_chain	1	TSGFNG	pseudo
2	7zt4_DE	12	alpha_chain	1	TSGFNG	pseudo
3	7zt5_DE	12	alpha_chain	1	TSGFNG	pseudo
4	7zt7_DE	12	alpha_chain	1	TSGFNG	pseudo
...	...	...	...	...	...	...
4807	6miv_CD	22	beta_chain	3	ASGDEGYTQY	canonical
4808	3rtq_CD	22	beta_chain	3	ASGDEGYTQY	canonical
4809	3dxa_NO	noise	beta_chain	3	ASRYRDDSYNEQF	NaN
4810	1d9k_AB	noise	beta_chain	3	ASGGQGRAEQF	NaN
4811	4gg6_GH	noise	beta_chain	3	ASSVAVSAGTYEQY	NaN

4812 rows × 6 columns

[5]:

# Fixed seed so the randomly initialised t-SNE embeddings are reproducible across runs.
TSNE_SEED = 0

cdr_frames = []

# Sort the paths: glob returns them in an arbitrary, filesystem-dependent order.
for path in sorted(glob.glob(os.path.join(DATA_DIR, '*_distance_matrix.txt*'))):
    # The file stem is '<cdr>_<chain>', e.g. 'cdr1_alpha', once the suffix is removed.
    name = os.path.basename(path).split('.')[0].replace('_distance_matrix', '')
    cdr, chain = name.split('_')

    cdr_distance_matrix = np.loadtxt(path)

    # The matrix already holds pair-wise distances, hence metric='precomputed'.
    cdr_tsne = TSNE(
        metric='precomputed',
        init='random',
        learning_rate='auto',
        random_state=TSNE_SEED,
    ).fit_transform(cdr_distance_matrix)

    # NOTE(review): assumes matrix rows follow the order of `structure_names` - confirm.
    cdr_df = pd.DataFrame({
        'name': structure_names,
        'tsne_1': cdr_tsne[:, 0],
        'tsne_2': cdr_tsne[:, 1],
        'chain_type': chain + '_chain',
        'cdr': int(cdr.replace('cdr', '')),
    })
    cdr_frames.append(cdr_df)

# Concatenate once at the end rather than growing the frame inside the loop.
df = pd.concat(cdr_frames)
df

[5]:

	name	tsne_1	tsne_2	chain_type	cdr
0	7zt2_DE	-12.995802	14.408054	beta_chain	2
1	7zt3_DE	-12.007703	13.610337	beta_chain	2
2	7zt4_DE	-12.722438	14.317860	beta_chain	2
3	7zt5_DE	-12.896334	14.089248	beta_chain	2
4	7zt7_DE	-12.443920	14.690278	beta_chain	2
...	...	...	...	...	...
797	6miv_CD	-7.597044	46.012802	alpha_chain	3
798	3rtq_CD	-11.724574	50.600700	alpha_chain	3
799	3dxa_NO	-28.833708	18.615999	alpha_chain	3
800	1d9k_AB	-18.946213	13.313275	alpha_chain	3
801	4gg6_GH	-25.602039	18.878851	alpha_chain	3

4812 rows × 5 columns

[6]:

# Attach the cluster label, sequence and cluster type to every embedded loop.
df = pd.merge(
    df,
    cdr_clusters,
    on=['name', 'chain_type', 'cdr'],
    how='inner',
)
df

[6]:

	name	tsne_1	tsne_2	chain_type	cdr	cluster	sequence	cluster_type
0	7zt2_DE	-12.995802	14.408054	beta_chain	2	10	SASEGT	pseudo
1	7zt3_DE	-12.007703	13.610337	beta_chain	2	10	SASEGT	pseudo
2	7zt4_DE	-12.722438	14.317860	beta_chain	2	10	SASEGT	pseudo
3	7zt5_DE	-12.896334	14.089248	beta_chain	2	10	SASEGT	pseudo
4	7zt7_DE	-12.443920	14.690278	beta_chain	2	10	SASEGT	pseudo
...	...	...	...	...	...	...	...	...
4807	6miv_CD	-7.597044	46.012802	alpha_chain	3	13	VVGDRGSALGRLH	canonical
4808	3rtq_CD	-11.724574	50.600700	alpha_chain	3	13	VVGDRGSALGRLH	canonical
4809	3dxa_NO	-28.833708	18.615999	alpha_chain	3	6	IVWGGYQKVT	canonical
4810	1d9k_AB	-18.946213	13.313275	alpha_chain	3	noise	AATGSFNKLT	NaN
4811	4gg6_GH	-25.602039	18.878851	alpha_chain	3	noise	ILRDGRGGADGLT	NaN

4812 rows × 8 columns

[7]:

# One panel per chain type (rows) and CDR (columns); each panel has its own axis ranges.
tsne_sorted = df.sort_values(by=['chain_type', 'cdr'])
g = sns.FacetGrid(data=tsne_sorted, row='chain_type', col='cdr', sharex=False, sharey=False)
g.map(sns.scatterplot, 'tsne_1', 'tsne_2', 'cluster')
g.add_legend()

[7]:

<seaborn.axisgrid.FacetGrid at 0x7f251c37b460>

../_images/source_Visualising_CDR_loop_clustering_8_1.png

[8]:

# Number of distinct non-noise clusters found for each chain type and CDR.
clustered = df[df['cluster'] != 'noise']
clustered.groupby(['chain_type', 'cdr'])['cluster'].nunique()

[8]:

chain_type   cdr
alpha_chain  1      23
             2      18
             3      30
beta_chain   1       8
             2      16
             3      28
Name: cluster, dtype: int64

Do the canonical clusters exist across different loop lengths?

[9]:

# Loop length = number of characters in the CDR sequence.
df['cdr_length'] = [len(sequence) for sequence in df['sequence']]

[10]:

# Count the distinct loop lengths inside each non-noise cluster, then tally how many
# clusters contain 1, 2, 3, ... different lengths.
lengths_per_cluster = (
    df[df['cluster'] != 'noise']
    .groupby(['chain_type', 'cdr', 'cluster'])['cdr_length']
    .nunique()
)
lengths_per_cluster.value_counts()

[10]:

1    107
2     14
3      2
Name: cdr_length, dtype: int64

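Yes, for some of them: 107 of the 123 clusters contain loops of a single length, but 14 clusters span two different loop lengths and 2 clusters span three.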

Visualising the canonical cluster structures and sequences

Sequence motifs for each canonical cluster

[11]:

def format_fasta(group):
    """Render the rows of ``group`` as FASTA text.

    Each row becomes a two-line record: a ``>`` header carrying the row's index label,
    followed by the value of its ``sequence`` column. Records are joined with newlines
    and there is no trailing newline.

    Args:
        group: DataFrame with a ``sequence`` column.

    Returns:
        The FASTA-formatted string (empty when ``group`` has no rows).
    """
    labels = group.index.tolist()
    residues = group['sequence'].tolist()

    return '\n'.join(f'>{label}\n{loop}' for label, loop in zip(labels, residues))

[12]:

# Draw one sequence logo per canonical cluster from a ClustalW alignment of its sequences.
for (chain_type, cdr, cluster), group in (df.query("cluster != 'noise' and cluster_type == 'canonical'")
                                            .groupby(['chain_type', 'cdr', 'cluster'])):
    with tempfile.TemporaryDirectory() as directory:
        input_file = os.path.join(directory, 'input.fasta')
        output_file = os.path.join(directory, 'output.fasta')

        with open(input_file, 'w') as fh:
            fh.write(format_fasta(group))

        # Pass the arguments as a list instead of splitting a command string, so a
        # temporary directory whose path contains whitespace cannot break the call.
        subprocess.run(
            ['clustalw2', f'-INFILE={input_file}', f'-OUTFILE={output_file}', '-OUTPUT=FASTA'],
            check=True,
            stdout=subprocess.DEVNULL,
        )

        # Keep every non-header line as one aligned sequence.
        # NOTE(review): assumes each aligned sequence fits on a single FASTA line - confirm.
        with open(output_file, 'r') as fh:
            aligned_sequences = [line.strip() for line in fh if not line.startswith('>')]

    # to_type='information' yields an information matrix, hence the 'Bits' axis label below.
    information_matrix = logomaker.alignment_to_matrix(aligned_sequences, to_type='information')
    logomaker.Logo(information_matrix, color_scheme='chemistry')

    cluster_name = f"CDR-{'A' if chain_type == 'alpha_chain' else 'B'}{cdr} - Cluster {cluster}"

    plt.title(cluster_name)
    plt.ylabel('Bits')
    plt.xlabel('Position')

    plt.show()

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_1.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_3.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_5.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_7.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_9.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_11.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_13.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_15.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_17.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_19.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_21.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_23.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_25.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_27.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_29.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_31.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_33.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_35.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_37.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_39.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_41.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_43.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_45.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_47.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_49.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_51.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_53.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_55.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_57.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_59.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_61.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_63.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_65.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_67.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_69.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_71.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_73.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_75.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_77.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_79.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_81.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_83.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_85.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_87.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_89.png

/project/koohylab/bmcmaste/miniconda3/envs/tcr-pmhc-interface-analysis/lib/python3.10/site-packages/logomaker/src/matrix.py:584: FutureWarning: In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`
  counts_df.loc[:, c] = tmp_mat.sum(axis=0).T

../_images/source_Visualising_CDR_loop_clustering_17_91.png

Look at backbone traces of each cluster

[13]:

# Atoms of the anchor residues whose coordinates drive the superposition.
BACKBONE_ATOM_QUERY = "atom_name == 'N' or atom_name == 'C' or atom_name == 'CA' or atom_name == 'O'"


def load_cdr_and_anchors(name, chain_type, cdr):
    """Load one structure and return the atoms of a CDR loop together with its anchors.

    Args:
        name: structure name of the form '<pdb_id>_<chains>', where <chains> is exactly
            two characters: the alpha chain id followed by the beta chain id.
        chain_type: value of the annotated ``chain_type`` column to select.
        cdr: CDR number (int) to select.

    Returns:
        Tuple ``(cdr_df, anchors)``: the rows of the requested CDR loop and the
        concatenated output of ``find_anchors`` for that loop.
    """
    pdb_id, chains = name.split('_')
    alpha_chain_id, beta_chain_id = tuple(chains)

    with open(os.path.join('../data/raw/stcrdab', 'imgt', pdb_id + '.pdb'), 'r') as fh:
        structure_df = parse_pdb_to_pandas(fh.read())

    structure_df = annotate_tcr_pmhc_df(structure_df, alpha_chain_id, beta_chain_id)
    # Keep only rows that received a chain_type annotation.
    tcr_df = structure_df.query('chain_type.notnull()')

    cdr_df = tcr_df.query('chain_type == @chain_type and cdr == @cdr')
    # NOTE(review): 5 is presumably the number of anchor residues to use - confirm
    # against `find_anchors`.
    anchors = pd.concat(find_anchors(cdr_df, tcr_df, 5))

    return cdr_df, anchors


ca_coordinates = []
names = []
chain_types = []
cdrs = []
clusters = []

for (chain_type, cdr, cluster), group in (df.query("cluster != 'noise' and cluster_type == 'canonical'")
                                            .groupby(['chain_type', 'cdr', 'cluster'])):
    cdr = int(cdr)

    # The first member of each cluster is the reference every member is superimposed onto.
    _, reference_anchors = load_cdr_and_anchors(group.iloc[0]['name'], chain_type, cdr)

    for name in group['name']:
        cdr_df, anchors = load_cdr_and_anchors(name, chain_type, cdr)

        # Superimpose on the anchors' backbone atoms and apply the transform to the loop.
        cdr_df = align_pandas_structure(
            get_coords(anchors.query(BACKBONE_ATOM_QUERY)),
            get_coords(reference_anchors.query(BACKBONE_ATOM_QUERY)),
            cdr_df,
        )

        # Only the C-alpha trace of the aligned loop is kept for plotting.
        ca_coordinates.append(get_coords(cdr_df.query("atom_name == 'CA'")))
        chain_types.append(chain_type)
        cdrs.append(cdr)
        clusters.append(cluster)
        names.append(name)

ca_coordinate_df = pd.DataFrame({
    'ca_coordinate': ca_coordinates,
    'chain_type': chain_types,
    'cdr': cdrs,
    'name': names,
    'cluster': clusters,
})

# One row per C-alpha atom: explode the per-loop coordinate arrays, then split x/y/z.
ca_coordinate_df = ca_coordinate_df.explode('ca_coordinate')
ca_coordinate_df[['ca_coordinate_x',
                  'ca_coordinate_y',
                  'ca_coordinate_z']] = ca_coordinate_df['ca_coordinate'].apply(pd.Series)
ca_coordinate_df = ca_coordinate_df.drop('ca_coordinate', axis=1)
ca_coordinate_df

[13]:

	chain_type	cdr	name	cluster	ca_coordinate_x	ca_coordinate_y	ca_coordinate_z
0	alpha_chain	1	4ozg_EF	1	224.155000	40.712000	219.187000
0	alpha_chain	1	4ozg_EF	1	227.900000	39.718000	219.665000
0	alpha_chain	1	4ozg_EF	1	229.249000	39.147000	223.257000
0	alpha_chain	1	4ozg_EF	1	232.636000	40.071000	224.817000
0	alpha_chain	1	4ozg_EF	1	234.200000	36.568000	224.708000
...	...	...	...	...	...	...	...
2245	beta_chain	3	3gsn_AB	7	-55.580629	12.969075	-2.638147
2245	beta_chain	3	3gsn_AB	7	-56.991148	13.552541	-6.135106
2245	beta_chain	3	3gsn_AB	7	-55.012334	12.367820	-9.171847
2245	beta_chain	3	3gsn_AB	7	-53.256231	14.847797	-11.441973
2245	beta_chain	3	3gsn_AB	7	-54.906846	15.594755	-14.752955

15537 rows × 7 columns

[14]:

# Human-readable label such as 'CDR-A1 - Cluster 12'.
# The cluster label is cast to str explicitly (as `cdr` already is) so the concatenation
# also works if the labels were loaded as integers rather than strings.
ca_coordinate_df['cluster_name'] = (
    'CDR-'
    + ca_coordinate_df['chain_type'].map(lambda chain_type: 'A' if chain_type == 'alpha_chain' else 'B')
    + ca_coordinate_df['cdr'].apply(str)
    + ' - Cluster '
    + ca_coordinate_df['cluster'].astype(str)
)

[15]:

# Build one 3D figure per cluster, with one green line per loop.
# NOTE(review): `fig` is overwritten on every iteration and never shown or saved here;
# add `fig.show()` if the figures are meant to be displayed by this cell.
for cluster_name, cluster_df in ca_coordinate_df.groupby('cluster_name'):
    fig = px.line_3d()

    # `cluster_df` already holds exactly this cluster's rows, so the frame is not
    # queried again for every cluster.
    for _, cdr_loop in cluster_df.groupby('name'):
        fig.add_scatter3d(x=cdr_loop['ca_coordinate_x'],
                          y=cdr_loop['ca_coordinate_y'],
                          z=cdr_loop['ca_coordinate_z'],
                          mode='lines',
                          line=dict(color='green'),
                          showlegend=False)

    fig.update_layout(title=cluster_name)

[16]:

def on_change(change):
    """Redraw the cluster selector and the 3D plot when a new cluster is picked.

    Args:
        change: trait-change dictionary passed by ipywidgets to observers; only
            changes of the dropdown's ``value`` trait are acted on.
    """
    if change['type'] == 'change' and change['name'] == 'value':
        clear_output(wait=True)

        # Rebuild the selector with the picked cluster pre-selected. Without `value` the
        # fresh dropdown falls back to its default option and no longer matches the plot.
        dropdown = Dropdown(options=cluster_names, value=change['new'], description='Select Cluster')
        dropdown.observe(on_change)
        display(VBox([dropdown]))

        plot_cluster(change['new'])

def plot_cluster(cluster_name):
    """Show a 3D plot with one green line per loop of the given cluster.

    Args:
        cluster_name: value of the ``cluster_name`` column of ``ca_coordinate_df``
            selecting the loops to draw; also used as the figure title.
    """
    in_cluster = ca_coordinate_df['cluster_name'] == cluster_name
    loops = ca_coordinate_df[in_cluster]

    fig = px.line_3d()
    for _, loop in loops.groupby('name'):
        fig.add_scatter3d(
            x=loop['ca_coordinate_x'],
            y=loop['ca_coordinate_y'],
            z=loop['ca_coordinate_z'],
            mode='lines',
            line={'color': 'green'},
            showlegend=False,
        )

    fig.update_layout(title=cluster_name)
    fig.show()

# All cluster labels, in order of first appearance.
cluster_names = list(ca_coordinate_df['cluster_name'].unique())

# Initial selector; `on_change` rebuilds it together with the plot on every selection.
dropdown = Dropdown(description='Select Cluster', options=cluster_names)
dropdown.observe(on_change)
display(VBox(children=[dropdown]))

# Start with the first cluster drawn.
plot_cluster(cluster_names[0])