Identify contact residues on MHC Class I molecules

Introduction

In this notebook, we aim to determine the IMGT positions of MHC molecules that make contact with the CDR loops of a contacting TCR. These contacts are then plotted based on the identity of the TCR loop to create a finger print of the TCRs on MHC molecules. We also look at the contacts made by TCRs on the presented peptide.

[1]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from python_pdb.parsers import parse_pdb_to_pandas

from tcr_pmhc_interface_analysis.imgt_numbering import assign_cdr_number

[2]:

CUTOFF_DISTANCE = 5 # Å

Loading TCR:pMHC-I Structures

[3]:

STCRDAB_PATH = '../data/raw/stcrdab'

[4]:

stcrdab_summary = pd.read_csv(os.path.join(STCRDAB_PATH, 'db_summary.dat'), delimiter='\t')

selected_stcrdab = stcrdab_summary.copy()

# Resolution better than 3.50 Å
selected_stcrdab['resolution'] = pd.to_numeric(selected_stcrdab['resolution'], errors='coerce')
selected_stcrdab = selected_stcrdab.query("resolution <= 3.50")

# alpha-beta TCRs
selected_stcrdab = selected_stcrdab.query("TCRtype == 'abTCR'")

# MHC class I
selected_stcrdab = selected_stcrdab.query("mhc_type == 'MH1'")

# peptide antigen
selected_stcrdab = selected_stcrdab.query("antigen_type == 'peptide'")

# General clean: drop columns that don't contain anything useful
selected_stcrdab = selected_stcrdab.loc[:, selected_stcrdab.nunique() > 1]
selected_stcrdab = selected_stcrdab.dropna(axis=1, how='all')

# Reset Index
selected_stcrdab = selected_stcrdab.reset_index(drop=True)

selected_stcrdab

[4]:

	pdb	Bchain	Achain	antigen_chain	antigen_name	mhc_chain1	mhc_chain2	docking_angle	beta_subgroup	alpha_subgroup	...	alpha_organism	antigen_organism	mhc_chain1_organism	mhc_chain2_organism	authors	resolution	method	r_free	r_factor	engineered
0	8gom	E	D	C	spike protein s2	A	B	39.649	NaN	NaN	...	homo sapiens	severe acute respiratory syndrome coronavirus2	homo sapiens	homo sapiens	Wu, D., Mariuzza, R.A.	2.783	X-RAY DIFFRACTION	0.248	0.195	True
1	8gon	E	D	C	spike protein s2	A	B	38.984	NaN	NaN	...	homo sapiens	severe acute respiratory syndrome coronavirus2	homo sapiens	homo sapiens	Wu, D., Mariuzza, R.A.	2.601	X-RAY DIFFRACTION	0.253	0.198	True
2	7q99	E	D	C	asn-leu-ser-ala-leu-gly-ile-phe-ser-thr	A	B	46.371	TRBV30	TRAV12	...	homo sapiens	homo sapiens	homo sapiens	homo sapiens	Rizkallah, P.J., Sewell, A.K., Wall, A., Fulle...	2.550	X-RAY DIFFRACTION	0.272	0.218	True
3	7q9a	E	D	C	leu-leu-leu-gly-ile-gly-ile-leu-val-leu	A	B	48.391	TRBV30	TRAV12	...	homo sapiens	homo sapiens	homo sapiens	homo sapiens	Rizkallah, P.J., Sewell, A.K., Wall, A., Fulle...	2.100	X-RAY DIFFRACTION	0.243	0.205	True
4	2ak4	E	D	C	ebv peptide lpeplpqgqltay	A	B	71.108	TRBV6	TRAV19	...	homo sapiens	NaN	homo sapiens	homo sapiens	Tynan, F.E., Burrows, S.R., Buckle, A.M., Clem...	2.500	X-RAY DIFFRACTION	0.278	0.246	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
297	6q3s	E	D	C	ser-leu-leu-met-trp-ile-thr-gln-val	A	B	67.059	TRBV6	TRAV21	...	homo sapiens	homo sapiens	homo sapiens	homo sapiens	Meijers, R., Anjanappa, R., Springer, S., Garc...	2.500	X-RAY DIFFRACTION	0.273	0.229	True
298	5men	E	D	C	ile-leu-ala-lys-phe-leu-his-trp-leu	A	B	40.823	TRBV6	TRAV22	...	homo sapiens	homo sapiens	homo sapiens	homo sapiens	Rizkallah, P.J., Lloyd, A., Crowther, M., Cole...	2.810	X-RAY DIFFRACTION	0.272	0.189	True
299	1ao7	E	D	C	tax peptide	A	B	34.827	TRBV6	TRAV12	...	homo sapiens	human t-lymphotropic virus 1	homo sapiens	homo sapiens	Garboczi, D.N., Ghosh, P., Utz, U., Fan, Q.R.,...	2.600	X-RAY DIFFRACTION	0.320	0.245	True
300	4jff	E	D	C	melanoma motif	A	B	42.977	TRBV30	TRAV12	...	homo sapiens	homo sapiens	homo sapiens	homo sapiens	Rizkallah, P.J., Cole, D.K., Madura, F., Sewel...	2.430	X-RAY DIFFRACTION	0.263	0.210	True
301	3dxa	O	N	M	ebv decapeptide epitope	K	L	58.225	TRBV7	TRAV26	...	homo sapiens	NaN	homo sapiens	homo sapiens	Archbold, J.K., Macdonald, W.A., Gras, S., Ros...	3.500	X-RAY DIFFRACTION	0.330	0.286	True

302 rows × 24 columns

Determine contacting residues

[5]:

contacts = []

for _, row in selected_stcrdab.iterrows():
    path = os.path.join(STCRDAB_PATH, 'imgt', f'{row.pdb}.pdb')

    with open(path, 'r') as fh:
        df = parse_pdb_to_pandas(fh.read())

    chain_annotations = {}
    chain_annotations[row.Achain] = 'alpha_chain'
    chain_annotations[row.Bchain] = 'beta_chain'
    chain_annotations[row.antigen_chain] = 'antigen_chain'
    chain_annotations[row.mhc_chain1] = 'mhc_chain'

    df['chain_type'] = df['chain_id'].map(lambda id_: chain_annotations[id_] if id_ in chain_annotations else None)
    df['cdr'] = df['residue_seq_id'].map(assign_cdr_number)

    mhc_df = df.query("chain_type == 'mhc_chain' and residue_seq_id < 1090")
    peptide_df = df.query("chain_type == 'antigen_chain'").copy()
    tcr_cdrs_df = df.query("(chain_type == 'alpha_chain' or chain_type == 'beta_chain') and cdr.notnull()")

    peptide_df_per_res = peptide_df.groupby(['residue_seq_id', 'residue_insert_code', 'residue_name'], dropna=False)
    group_mapping = {group: idx for idx, group in enumerate(peptide_df_per_res.groups.keys(), 1)}

    peptide_df['peptide_length'] = max(group_mapping.values())
    peptide_df['peptide_position'] = peptide_df[['residue_seq_id',
                                                 'residue_insert_code',
                                                 'residue_name']].apply(tuple, axis='columns').map(group_mapping)

    tcr_mhc_interface = tcr_cdrs_df.merge(mhc_df, how='cross', suffixes=('_tcr', '_mhc'))
    tcr_mhc_interface['distance'] = np.sqrt(np.square(tcr_mhc_interface['pos_x_tcr']
                                                       - tcr_mhc_interface['pos_x_mhc'])
                                             + np.square(tcr_mhc_interface['pos_y_tcr']
                                                         - tcr_mhc_interface['pos_y_mhc'])
                                             + np.square(tcr_mhc_interface['pos_z_tcr']
                                                         - tcr_mhc_interface['pos_z_mhc']))

    tcr_peptide_interface = tcr_cdrs_df.merge(peptide_df, how='cross', suffixes=('_tcr', '_peptide'))
    tcr_peptide_interface['distance'] = np.sqrt(np.square(tcr_peptide_interface['pos_x_tcr']
                                                          - tcr_peptide_interface['pos_x_peptide'])
                                                + np.square(tcr_peptide_interface['pos_y_tcr']
                                                            - tcr_peptide_interface['pos_y_peptide'])
                                                + np.square(tcr_peptide_interface['pos_z_tcr']
                                                            - tcr_peptide_interface['pos_z_peptide']))

    contacts_tcr_mhc = tcr_mhc_interface.query('distance < @CUTOFF_DISTANCE').copy()
    contacts_tcr_mhc['path'] = path
    contacts_tcr_mhc['interface_type'] = 'tcr_mhc'
    contacts.append(contacts_tcr_mhc)

    contacts_tcr_peptide = tcr_peptide_interface.query('distance < @CUTOFF_DISTANCE').copy()
    contacts_tcr_peptide['path'] = path
    contacts_tcr_peptide['interface_type'] = 'tcr_peptide'
    contacts.append(contacts_tcr_peptide)

contacts = pd.concat(contacts)
contacts

[5]:

	record_type_tcr	atom_number_tcr	atom_name_tcr	alt_loc_tcr	residue_name_tcr	chain_id_tcr	residue_seq_id_tcr	residue_insert_code_tcr	pos_x_tcr	pos_y_tcr	...	pos_y_peptide	pos_z_peptide	occupancy_peptide	b_factor_peptide	element_peptide	charge_peptide	chain_type_peptide	cdr_peptide	peptide_length	peptide_position
69473	ATOM	472	C	None	ASN	E	57	None	12.312	-60.161	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
69474	ATOM	472	C	None	ASN	E	57	None	12.312	-60.161	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
70937	ATOM	473	O	None	ASN	E	57	None	11.507	-60.966	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
70938	ATOM	473	O	None	ASN	E	57	None	11.507	-60.966	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
70939	ATOM	473	O	None	ASN	E	57	None	11.507	-60.966	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
31515	ATOM	10078	CD2	None	TYR	N	113	None	-51.375	74.966	...	73.846	-3.678	1.0	63.86	O	None	antigen_chain	NaN	10.0	4.0
31516	ATOM	10078	CD2	None	TYR	N	113	None	-51.375	74.966	...	75.095	-6.148	1.0	63.91	C	None	antigen_chain	NaN	10.0	4.0
31517	ATOM	10078	CD2	None	TYR	N	113	None	-51.375	74.966	...	76.511	-6.671	1.0	63.85	C	None	antigen_chain	NaN	10.0	4.0
31697	ATOM	10080	CE2	None	TYR	N	113	None	-52.313	73.948	...	73.846	-3.678	1.0	63.86	O	None	antigen_chain	NaN	10.0	4.0
31698	ATOM	10080	CE2	None	TYR	N	113	None	-52.313	73.948	...	75.095	-6.148	1.0	63.91	C	None	antigen_chain	NaN	10.0	4.0

156312 rows × 56 columns

[6]:

contacts['pdb_id'] = contacts['path'].map(lambda path: path.split('/')[-1].split('.')[0])

[7]:

def create_resi(seq_id: float, insert_code: str | float) -> str | None:
    if pd.isnull(seq_id):
        return None

    return str(int(seq_id)) + (insert_code if not pd.isnull(insert_code) else '')

contacts['resi_mhc'] = contacts.apply(lambda row: create_resi(row.residue_seq_id_mhc, row.residue_insert_code_mhc),
                                      axis='columns')
contacts['resi_tcr'] = contacts.apply(lambda row: create_resi(row.residue_seq_id_tcr, row.residue_insert_code_tcr),
                                      axis='columns')

[8]:

contacts.loc[contacts['peptide_position'].notnull(),
             'peptide_position'] = contacts.loc[contacts['peptide_position'].notnull(),
                                                'peptide_position'].apply(int).apply(str)

[9]:

contacts['cdr_name'] = contacts.apply(
    lambda row: f"CDR-{'A' if row.chain_type_tcr == 'alpha_chain' else 'B'}{int(row.cdr_tcr)}",
    axis='columns',
)

[10]:

contacts['cdr_name'] = pd.Categorical(contacts['cdr_name'],
                                      ['CDR-A1', 'CDR-A2', 'CDR-A3', 'CDR-B1', 'CDR-B2', 'CDR-B3'])

Visualising contact maps

[11]:

palette = sns.color_palette(['#' + colour.lower()
                             for colour in ['DC91BE','C0C0C2', '6490C7', '963E87','2D2E62','56BBD3']], 6)

TCR Contacts on MHC Molecules

[12]:

plt.figure(figsize=(12, 5))

ax = sns.histplot(contacts.query("interface_type == 'tcr_mhc'").sort_values('residue_seq_id_mhc'),
                  x='resi_mhc',
                  hue='cdr_name',
                  multiple='stack',
                  stat='percent',
                  palette=palette)

ax.legend_.set_title(title='CDR Type')
ax.set_xlabel('IMGT Residue Number')
ax.set_ylabel('Percent of total MHC contacts per residue')
plt.xticks(rotation=90)

plt.show()

../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_17_0.png

[13]:

mhc_labels = contacts[
    ['residue_seq_id_mhc', 'residue_insert_code_mhc', 'resi_mhc']
].fillna('').sort_values(['residue_seq_id_mhc', 'residue_insert_code_mhc'])['resi_mhc'].unique().tolist()

tcr_labels = contacts[
    ['residue_seq_id_tcr', 'residue_insert_code_tcr', 'resi_tcr']
    ].fillna('').sort_values(['residue_seq_id_tcr', 'residue_insert_code_tcr'])['resi_tcr'].unique().tolist()

contacts['resi_mhc'] = pd.Categorical(contacts['resi_mhc'], mhc_labels)
contacts['resi_tcr'] = pd.Categorical(contacts['resi_tcr'], tcr_labels)

[14]:

plt.figure(figsize=(12, 10))

ax = sns.histplot(contacts.query("interface_type == 'tcr_mhc'"),
                  x='resi_mhc', y='resi_tcr',
                  hue='cdr_name',
                  palette=sns.color_palette(palette, 6))

ax.legend_.set_title(title='CDR Type')
ax.set_xlabel('MHC IMGT Residue Number')
ax.set_ylabel('TCR IMGT Residue Number')
plt.xticks(rotation=90)

plt.show()

../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_19_0.png

TCR Contacts on Peptides

[15]:

ax = sns.histplot(contacts.query("interface_type == 'tcr_peptide'").sort_values('residue_seq_id_peptide'),
                  x='peptide_position',
                  hue='cdr_name',
                  multiple='stack',
                  stat='percent',
                  palette=sns.color_palette(palette, 6))

ax.set_title('TCR Contacts on the Peptide')
ax.set_xlabel('Peptide Position')
ax.set_ylabel('Percent of total peptide contacts per residue')
ax.legend_.set_title(title='CDR Type')

plt.show()

../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_21_0.png

[16]:

sns.histplot(contacts.query("interface_type == 'tcr_peptide'").drop_duplicates('path'), x='peptide_length')

[16]:

<AxesSubplot: xlabel='peptide_length', ylabel='Count'>

../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_22_1.png

The mode of peptide lengths in 9-mers!

[17]:

ax = sns.histplot(contacts.query("interface_type == 'tcr_peptide' and peptide_length == 9.0").sort_values('residue_seq_id_peptide'),
                  x='peptide_position',
                  hue='cdr_name',
                  multiple='stack',
                  stat='percent',
                  palette=sns.color_palette(palette, 6))

ax.set_title('TCR Contacts on the nonamer-peptides')
ax.set_xlabel('Peptide Position')
ax.set_ylabel('Percent of total peptide contacts per residue')
ax.legend_.set_title(title='CDR Type')

plt.show()

../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_24_0.png

For the nonamers, which is the dominant contacting loop at each peptide position?

[18]:

def select_dominant(group: pd.DataFrame) -> str:
    return group.sort_values('count', ascending=False).iloc[0]['cdr_name']

nonanmer_peptide_tcr_contact_counts = (contacts.query("interface_type == 'tcr_peptide' and peptide_length == 9.0")
                                               .value_counts(['peptide_position', 'cdr_name']))
nonanmer_peptide_tcr_contact_counts.name = 'count'
nonanmer_peptide_tcr_contact_counts = nonanmer_peptide_tcr_contact_counts.reset_index()

nonanmer_peptide_tcr_contact_counts.groupby('peptide_position').apply(select_dominant)

[18]:

peptide_position
1    CDR-A3
2    CDR-A3
3    CDR-A3
4    CDR-A3
5    CDR-A3
6    CDR-B3
7    CDR-B3
8    CDR-B3
9    CDR-B3
dtype: object

Do the non-nonamers have the same trend?

[19]:

ax = sns.histplot(contacts.query("interface_type == 'tcr_peptide' and peptide_length != 9.0").sort_values('residue_seq_id_peptide'),
                  x='peptide_position',
                  hue='cdr_name',
                  multiple='stack',
                  stat='percent',
                  palette=sns.color_palette(palette, 6))

ax.set_title('TCR Contacts on the non-nonamer-peptides')
ax.set_xlabel('Peptide Position')
ax.set_ylabel('Percent of total peptide contacts per residue')
ax.legend_.set_title(title='CDR Type')

plt.show()

../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_28_0.png

Yes, it seems that they do.

Exporting residue list

[20]:

contact_positions = contacts.value_counts(['cdr_name', 'resi_mhc'])
contact_positions.name = 'count'
contact_positions = contact_positions.to_frame().reset_index().sort_values('resi_mhc')

contact_positions.to_csv('../data/processed/mhc_contacts.csv', index=False)