Identify contact residues on MHC Class I molecules

Introduction

In this notebook, we aim to determine the IMGT positions of MHC molecules that make contact with the CDR loops of a contacting TCR. These contacts are then plotted based on the identity of the TCR loop to create a finger print of the TCRs on MHC molecules. We also look at the contacts made by TCRs on the presented peptide.

[1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from python_pdb.parsers import parse_pdb_to_pandas

from tcr_pmhc_interface_analysis.imgt_numbering import assign_cdr_number
[2]:
CUTOFF_DISTANCE = 5 # Å

Loading TCR:pMHC-I Structures

[3]:
STCRDAB_PATH = '../data/raw/stcrdab'
[4]:
stcrdab_summary = pd.read_csv(os.path.join(STCRDAB_PATH, 'db_summary.dat'), delimiter='\t')

selected_stcrdab = stcrdab_summary.copy()

# Resolution better than 3.50 Å
selected_stcrdab['resolution'] = pd.to_numeric(selected_stcrdab['resolution'], errors='coerce')
selected_stcrdab = selected_stcrdab.query("resolution <= 3.50")

# alpha-beta TCRs
selected_stcrdab = selected_stcrdab.query("TCRtype == 'abTCR'")

# MHC class I
selected_stcrdab = selected_stcrdab.query("mhc_type == 'MH1'")

# peptide antigen
selected_stcrdab = selected_stcrdab.query("antigen_type == 'peptide'")

# General clean: drop columns that don't contain anything useful
selected_stcrdab = selected_stcrdab.loc[:, selected_stcrdab.nunique() > 1]
selected_stcrdab = selected_stcrdab.dropna(axis=1, how='all')

# Reset Index
selected_stcrdab = selected_stcrdab.reset_index(drop=True)

selected_stcrdab
[4]:
pdb Bchain Achain antigen_chain antigen_name mhc_chain1 mhc_chain2 docking_angle beta_subgroup alpha_subgroup ... alpha_organism antigen_organism mhc_chain1_organism mhc_chain2_organism authors resolution method r_free r_factor engineered
0 8gom E D C spike protein s2 A B 39.649 NaN NaN ... homo sapiens severe acute respiratory syndrome coronavirus2 homo sapiens homo sapiens Wu, D., Mariuzza, R.A. 2.783 X-RAY DIFFRACTION 0.248 0.195 True
1 8gon E D C spike protein s2 A B 38.984 NaN NaN ... homo sapiens severe acute respiratory syndrome coronavirus2 homo sapiens homo sapiens Wu, D., Mariuzza, R.A. 2.601 X-RAY DIFFRACTION 0.253 0.198 True
2 7q99 E D C asn-leu-ser-ala-leu-gly-ile-phe-ser-thr A B 46.371 TRBV30 TRAV12 ... homo sapiens homo sapiens homo sapiens homo sapiens Rizkallah, P.J., Sewell, A.K., Wall, A., Fulle... 2.550 X-RAY DIFFRACTION 0.272 0.218 True
3 7q9a E D C leu-leu-leu-gly-ile-gly-ile-leu-val-leu A B 48.391 TRBV30 TRAV12 ... homo sapiens homo sapiens homo sapiens homo sapiens Rizkallah, P.J., Sewell, A.K., Wall, A., Fulle... 2.100 X-RAY DIFFRACTION 0.243 0.205 True
4 2ak4 E D C ebv peptide lpeplpqgqltay A B 71.108 TRBV6 TRAV19 ... homo sapiens NaN homo sapiens homo sapiens Tynan, F.E., Burrows, S.R., Buckle, A.M., Clem... 2.500 X-RAY DIFFRACTION 0.278 0.246 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
297 6q3s E D C ser-leu-leu-met-trp-ile-thr-gln-val A B 67.059 TRBV6 TRAV21 ... homo sapiens homo sapiens homo sapiens homo sapiens Meijers, R., Anjanappa, R., Springer, S., Garc... 2.500 X-RAY DIFFRACTION 0.273 0.229 True
298 5men E D C ile-leu-ala-lys-phe-leu-his-trp-leu A B 40.823 TRBV6 TRAV22 ... homo sapiens homo sapiens homo sapiens homo sapiens Rizkallah, P.J., Lloyd, A., Crowther, M., Cole... 2.810 X-RAY DIFFRACTION 0.272 0.189 True
299 1ao7 E D C tax peptide A B 34.827 TRBV6 TRAV12 ... homo sapiens human t-lymphotropic virus 1 homo sapiens homo sapiens Garboczi, D.N., Ghosh, P., Utz, U., Fan, Q.R.,... 2.600 X-RAY DIFFRACTION 0.320 0.245 True
300 4jff E D C melanoma motif A B 42.977 TRBV30 TRAV12 ... homo sapiens homo sapiens homo sapiens homo sapiens Rizkallah, P.J., Cole, D.K., Madura, F., Sewel... 2.430 X-RAY DIFFRACTION 0.263 0.210 True
301 3dxa O N M ebv decapeptide epitope K L 58.225 TRBV7 TRAV26 ... homo sapiens NaN homo sapiens homo sapiens Archbold, J.K., Macdonald, W.A., Gras, S., Ros... 3.500 X-RAY DIFFRACTION 0.330 0.286 True

302 rows × 24 columns

Determine contacting residues

[5]:
contacts = []

for _, row in selected_stcrdab.iterrows():
    path = os.path.join(STCRDAB_PATH, 'imgt', f'{row.pdb}.pdb')

    with open(path, 'r') as fh:
        df = parse_pdb_to_pandas(fh.read())

    chain_annotations = {}
    chain_annotations[row.Achain] = 'alpha_chain'
    chain_annotations[row.Bchain] = 'beta_chain'
    chain_annotations[row.antigen_chain] = 'antigen_chain'
    chain_annotations[row.mhc_chain1] = 'mhc_chain'

    df['chain_type'] = df['chain_id'].map(lambda id_: chain_annotations[id_] if id_ in chain_annotations else None)
    df['cdr'] = df['residue_seq_id'].map(assign_cdr_number)

    mhc_df = df.query("chain_type == 'mhc_chain' and residue_seq_id < 1090")
    peptide_df = df.query("chain_type == 'antigen_chain'").copy()
    tcr_cdrs_df = df.query("(chain_type == 'alpha_chain' or chain_type == 'beta_chain') and cdr.notnull()")

    peptide_df_per_res = peptide_df.groupby(['residue_seq_id', 'residue_insert_code', 'residue_name'], dropna=False)
    group_mapping = {group: idx for idx, group in enumerate(peptide_df_per_res.groups.keys(), 1)}

    peptide_df['peptide_length'] = max(group_mapping.values())
    peptide_df['peptide_position'] = peptide_df[['residue_seq_id',
                                                 'residue_insert_code',
                                                 'residue_name']].apply(tuple, axis='columns').map(group_mapping)

    tcr_mhc_interface = tcr_cdrs_df.merge(mhc_df, how='cross', suffixes=('_tcr', '_mhc'))
    tcr_mhc_interface['distance'] = np.sqrt(np.square(tcr_mhc_interface['pos_x_tcr']
                                                       - tcr_mhc_interface['pos_x_mhc'])
                                             + np.square(tcr_mhc_interface['pos_y_tcr']
                                                         - tcr_mhc_interface['pos_y_mhc'])
                                             + np.square(tcr_mhc_interface['pos_z_tcr']
                                                         - tcr_mhc_interface['pos_z_mhc']))

    tcr_peptide_interface = tcr_cdrs_df.merge(peptide_df, how='cross', suffixes=('_tcr', '_peptide'))
    tcr_peptide_interface['distance'] = np.sqrt(np.square(tcr_peptide_interface['pos_x_tcr']
                                                          - tcr_peptide_interface['pos_x_peptide'])
                                                + np.square(tcr_peptide_interface['pos_y_tcr']
                                                            - tcr_peptide_interface['pos_y_peptide'])
                                                + np.square(tcr_peptide_interface['pos_z_tcr']
                                                            - tcr_peptide_interface['pos_z_peptide']))

    contacts_tcr_mhc = tcr_mhc_interface.query('distance < @CUTOFF_DISTANCE').copy()
    contacts_tcr_mhc['path'] = path
    contacts_tcr_mhc['interface_type'] = 'tcr_mhc'
    contacts.append(contacts_tcr_mhc)

    contacts_tcr_peptide = tcr_peptide_interface.query('distance < @CUTOFF_DISTANCE').copy()
    contacts_tcr_peptide['path'] = path
    contacts_tcr_peptide['interface_type'] = 'tcr_peptide'
    contacts.append(contacts_tcr_peptide)

contacts = pd.concat(contacts)
contacts
[5]:
record_type_tcr atom_number_tcr atom_name_tcr alt_loc_tcr residue_name_tcr chain_id_tcr residue_seq_id_tcr residue_insert_code_tcr pos_x_tcr pos_y_tcr ... pos_y_peptide pos_z_peptide occupancy_peptide b_factor_peptide element_peptide charge_peptide chain_type_peptide cdr_peptide peptide_length peptide_position
69473 ATOM 472 C None ASN E 57 None 12.312 -60.161 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
69474 ATOM 472 C None ASN E 57 None 12.312 -60.161 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
70937 ATOM 473 O None ASN E 57 None 11.507 -60.966 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
70938 ATOM 473 O None ASN E 57 None 11.507 -60.966 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
70939 ATOM 473 O None ASN E 57 None 11.507 -60.966 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
31515 ATOM 10078 CD2 None TYR N 113 None -51.375 74.966 ... 73.846 -3.678 1.0 63.86 O None antigen_chain NaN 10.0 4.0
31516 ATOM 10078 CD2 None TYR N 113 None -51.375 74.966 ... 75.095 -6.148 1.0 63.91 C None antigen_chain NaN 10.0 4.0
31517 ATOM 10078 CD2 None TYR N 113 None -51.375 74.966 ... 76.511 -6.671 1.0 63.85 C None antigen_chain NaN 10.0 4.0
31697 ATOM 10080 CE2 None TYR N 113 None -52.313 73.948 ... 73.846 -3.678 1.0 63.86 O None antigen_chain NaN 10.0 4.0
31698 ATOM 10080 CE2 None TYR N 113 None -52.313 73.948 ... 75.095 -6.148 1.0 63.91 C None antigen_chain NaN 10.0 4.0

156312 rows × 56 columns

[6]:
contacts['pdb_id'] = contacts['path'].map(lambda path: path.split('/')[-1].split('.')[0])
[7]:
def create_resi(seq_id: float, insert_code: str | float) -> str | None:
    if pd.isnull(seq_id):
        return None

    return str(int(seq_id)) + (insert_code if not pd.isnull(insert_code) else '')

contacts['resi_mhc'] = contacts.apply(lambda row: create_resi(row.residue_seq_id_mhc, row.residue_insert_code_mhc),
                                      axis='columns')
contacts['resi_tcr'] = contacts.apply(lambda row: create_resi(row.residue_seq_id_tcr, row.residue_insert_code_tcr),
                                      axis='columns')
[8]:
contacts.loc[contacts['peptide_position'].notnull(),
             'peptide_position'] = contacts.loc[contacts['peptide_position'].notnull(),
                                                'peptide_position'].apply(int).apply(str)
[9]:
contacts['cdr_name'] = contacts.apply(
    lambda row: f"CDR-{'A' if row.chain_type_tcr == 'alpha_chain' else 'B'}{int(row.cdr_tcr)}",
    axis='columns',
)
[10]:
contacts['cdr_name'] = pd.Categorical(contacts['cdr_name'],
                                      ['CDR-A1', 'CDR-A2', 'CDR-A3', 'CDR-B1', 'CDR-B2', 'CDR-B3'])

Visualising contact maps

[11]:
palette = sns.color_palette(['#' + colour.lower()
                             for colour in ['DC91BE','C0C0C2', '6490C7', '963E87','2D2E62','56BBD3']], 6)

TCR Contacts on MHC Molecules

[12]:
plt.figure(figsize=(12, 5))

ax = sns.histplot(contacts.query("interface_type == 'tcr_mhc'").sort_values('residue_seq_id_mhc'),
                  x='resi_mhc',
                  hue='cdr_name',
                  multiple='stack',
                  stat='percent',
                  palette=palette)

ax.legend_.set_title(title='CDR Type')
ax.set_xlabel('IMGT Residue Number')
ax.set_ylabel('Percent of total MHC contacts per residue')
plt.xticks(rotation=90)

plt.show()
../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_17_0.png
[13]:
mhc_labels = contacts[
    ['residue_seq_id_mhc', 'residue_insert_code_mhc', 'resi_mhc']
].fillna('').sort_values(['residue_seq_id_mhc', 'residue_insert_code_mhc'])['resi_mhc'].unique().tolist()

tcr_labels = contacts[
    ['residue_seq_id_tcr', 'residue_insert_code_tcr', 'resi_tcr']
    ].fillna('').sort_values(['residue_seq_id_tcr', 'residue_insert_code_tcr'])['resi_tcr'].unique().tolist()

contacts['resi_mhc'] = pd.Categorical(contacts['resi_mhc'], mhc_labels)
contacts['resi_tcr'] = pd.Categorical(contacts['resi_tcr'], tcr_labels)
[14]:
plt.figure(figsize=(12, 10))

ax = sns.histplot(contacts.query("interface_type == 'tcr_mhc'"),
                  x='resi_mhc', y='resi_tcr',
                  hue='cdr_name',
                  palette=sns.color_palette(palette, 6))

ax.legend_.set_title(title='CDR Type')
ax.set_xlabel('MHC IMGT Residue Number')
ax.set_ylabel('TCR IMGT Residue Number')
plt.xticks(rotation=90)

plt.show()
../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_19_0.png

TCR Contacts on Peptides

[15]:
ax = sns.histplot(contacts.query("interface_type == 'tcr_peptide'").sort_values('residue_seq_id_peptide'),
                  x='peptide_position',
                  hue='cdr_name',
                  multiple='stack',
                  stat='percent',
                  palette=sns.color_palette(palette, 6))

ax.set_title('TCR Contacts on the Peptide')
ax.set_xlabel('Peptide Position')
ax.set_ylabel('Percent of total peptide contacts per residue')
ax.legend_.set_title(title='CDR Type')

plt.show()
../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_21_0.png
[16]:
sns.histplot(contacts.query("interface_type == 'tcr_peptide'").drop_duplicates('path'), x='peptide_length')
[16]:
<AxesSubplot: xlabel='peptide_length', ylabel='Count'>
../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_22_1.png

The mode of peptide lengths in 9-mers!

[17]:
ax = sns.histplot(contacts.query("interface_type == 'tcr_peptide' and peptide_length == 9.0").sort_values('residue_seq_id_peptide'),
                  x='peptide_position',
                  hue='cdr_name',
                  multiple='stack',
                  stat='percent',
                  palette=sns.color_palette(palette, 6))

ax.set_title('TCR Contacts on the nonamer-peptides')
ax.set_xlabel('Peptide Position')
ax.set_ylabel('Percent of total peptide contacts per residue')
ax.legend_.set_title(title='CDR Type')

plt.show()
../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_24_0.png

For the nonamers, which is the dominant contacting loop at each peptide position?

[18]:
def select_dominant(group: pd.DataFrame) -> str:
    return group.sort_values('count', ascending=False).iloc[0]['cdr_name']

nonanmer_peptide_tcr_contact_counts = (contacts.query("interface_type == 'tcr_peptide' and peptide_length == 9.0")
                                               .value_counts(['peptide_position', 'cdr_name']))
nonanmer_peptide_tcr_contact_counts.name = 'count'
nonanmer_peptide_tcr_contact_counts = nonanmer_peptide_tcr_contact_counts.reset_index()

nonanmer_peptide_tcr_contact_counts.groupby('peptide_position').apply(select_dominant)
[18]:
peptide_position
1    CDR-A3
2    CDR-A3
3    CDR-A3
4    CDR-A3
5    CDR-A3
6    CDR-B3
7    CDR-B3
8    CDR-B3
9    CDR-B3
dtype: object

Do the non-nonamers have the same trend?

[19]:
ax = sns.histplot(contacts.query("interface_type == 'tcr_peptide' and peptide_length != 9.0").sort_values('residue_seq_id_peptide'),
                  x='peptide_position',
                  hue='cdr_name',
                  multiple='stack',
                  stat='percent',
                  palette=sns.color_palette(palette, 6))

ax.set_title('TCR Contacts on the non-nonamer-peptides')
ax.set_xlabel('Peptide Position')
ax.set_ylabel('Percent of total peptide contacts per residue')
ax.legend_.set_title(title='CDR Type')

plt.show()
../_images/source_Identify_contact_residues_on_MHC_Class_I_molecules_28_0.png

Yes, it seems that they do.

Exporting residue list

[20]:
contact_positions = contacts.value_counts(['cdr_name', 'resi_mhc'])
contact_positions.name = 'count'
contact_positions = contact_positions.to_frame().reset_index().sort_values('resi_mhc')

contact_positions.to_csv('../data/processed/mhc_contacts.csv', index=False)