pMHC movement between apo and holo conformations

Introduction

In this notebook we aim to determine what parts of the pMHC, if any, move when contacted by TCRs. We split the MHC domain into MHC-non-TCR contact and MHC-TCR contact based on previous analysis of the TCR finger print on the MHC molecules.

[1]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
[2]:
NOISE_LEVEL = 0.5 # Å

Load comparisons

[3]:
results = pd.read_csv('../data/processed/apo-holo-tcr-pmhc-class-I-comparisons/pmhc_tcr_contact_apo_holo.csv')
results
[3]:
complex_id structure_x_name structure_y_name chain_type tcr_contact rmsd
0 3qdg_D-E-C-A-B_tcr_pmhc 1jf1_A-B-C_pmhc.pdb 3qdg_D-E-C-A-B_tcr_pmhc.pdb mhc_chain1 False 0.630013
1 3qdg_D-E-C-A-B_tcr_pmhc 1jf1_A-B-C_pmhc.pdb 3qdg_D-E-C-A-B_tcr_pmhc.pdb mhc_chain1 True 0.582077
2 7ow6_D-E-C-A-B_tcr_pmhc 7ow4_A-B-C_pmhc.pdb 7ow4_D-E-F_pmhc.pdb antigen_chain False 0.349159
3 7ow6_D-E-C-A-B_tcr_pmhc 7ow4_A-B-C_pmhc.pdb 7ow4_D-E-F_pmhc.pdb mhc_chain1 False 0.306333
4 7ow6_D-E-C-A-B_tcr_pmhc 7ow4_A-B-C_pmhc.pdb 7ow4_D-E-F_pmhc.pdb mhc_chain1 True 0.265103
... ... ... ... ... ... ...
1989 7rtr_D-E-C-A-B_tcr_pmhc 7p3d_A-B-C_pmhc.pdb 7rtr_D-E-C-A-B_tcr_pmhc.pdb mhc_chain1 False 0.497769
1990 7rtr_D-E-C-A-B_tcr_pmhc 7p3d_A-B-C_pmhc.pdb 7rtr_D-E-C-A-B_tcr_pmhc.pdb mhc_chain1 True 0.458372
1991 7rtr_D-E-C-A-B_tcr_pmhc 7rtd_A-B-C_pmhc.pdb 7rtr_D-E-C-A-B_tcr_pmhc.pdb antigen_chain False 0.455078
1992 7rtr_D-E-C-A-B_tcr_pmhc 7rtd_A-B-C_pmhc.pdb 7rtr_D-E-C-A-B_tcr_pmhc.pdb mhc_chain1 False 0.449558
1993 7rtr_D-E-C-A-B_tcr_pmhc 7rtd_A-B-C_pmhc.pdb 7rtr_D-E-C-A-B_tcr_pmhc.pdb mhc_chain1 True 0.493225

1994 rows × 6 columns

[4]:
results_holo_holo = pd.read_csv('../data/processed/apo-holo-tcr-pmhc-class-I-comparisons/pmhc_tcr_contact_holo.csv')
[5]:
results_holo_holo = results_holo_holo.query("chain_type == 'mhc_chain1' or chain_type == 'antigen_chain'")
[6]:
results_holo_holo['mhc_slug'] = None
results_holo_holo['peptide_sequence'] = None
[7]:
mhc_pattern = r'^hla|h2'
mhc_complex_ids = results_holo_holo['complex_id'].str.contains(mhc_pattern, regex=True)

mhc_slug_peptides = results_holo_holo[mhc_complex_ids]['complex_id'].str.rsplit('_', n=1)
mhc_slugs = mhc_slug_peptides.map(lambda composite: composite[0])
peptides = mhc_slug_peptides.map(lambda composite: composite[1])

results_holo_holo.loc[mhc_complex_ids, 'mhc_slug'] = mhc_slugs
results_holo_holo.loc[mhc_complex_ids, 'peptide_sequence'] = peptides

results_holo_holo = results_holo_holo[mhc_complex_ids]

Load summary data

[10]:
apo_holo_summary_df = pd.read_csv('../data/processed/apo-holo-tcr-pmhc-class-I/apo_holo_summary.csv')
apo_holo_summary_df
[10]:
file_name pdb_id structure_type state alpha_chain beta_chain antigen_chain mhc_chain1 mhc_chain2 cdr_sequences_collated peptide_sequence mhc_slug
0 1ao7_D-E-C-A-B_tcr_pmhc.pdb 1ao7 tcr_pmhc holo D E C A B DRGSQS-IYSNGD-AVTTDSWGKLQ-MNHEY-SVGAGI-ASRPGLA... LLFGYPVYV hla_a_02_01
1 1bd2_D-E-C-A-B_tcr_pmhc.pdb 1bd2 tcr_pmhc holo D E C A B NSMFDY-ISSIKDK-AAMEGAQKLV-MNHEY-SVGAGI-ASSYPGG... LLFGYPVYV hla_a_02_01
2 1bii_A-B-P_pmhc.pdb 1bii pmhc apo NaN NaN P A B NaN RGPGRAFVTI h2_dd
3 1ddh_A-B-P_pmhc.pdb 1ddh pmhc apo NaN NaN P A B NaN RGPGRAFVTI h2_dd
4 1duz_A-B-C_pmhc.pdb 1duz pmhc apo NaN NaN C A B NaN LLFGYPVYV hla_a_02_01
... ... ... ... ... ... ... ... ... ... ... ... ...
353 8gon_D-E-C-A-B_tcr_pmhc.pdb 8gon tcr_pmhc holo D E C A B TSESDYY-QEAYKQQN-ASSGNTPLV-SGHNS-FNNNVP-ASTWGR... NaN NaN
354 8gop_A-B_tcr.pdb 8gop tcr apo A B NaN NaN NaN TSESDYY-QEAYKQQN-ASSGNTPLV-SGHNS-FNNNVP-ASTWGR... NaN NaN
355 8gvb_A-B-P-H-L_tcr_pmhc.pdb 8gvb tcr_pmhc holo A B P H L YGATPY-YFSGDTLV-AVGFTGGGNKLT-SEHNR-FQNEAQ-ASSD... RYPLTFGW hla_a_24_02
356 8gvg_A-B-P-H-L_tcr_pmhc.pdb 8gvg tcr_pmhc holo A B P H L YGATPY-YFSGDTLV-AVGFTGGGNKLT-SEHNR-FQNEAQ-ASSD... RFPLTFGW hla_a_24_02
357 8gvi_A-B-P-H-L_tcr_pmhc.pdb 8gvi tcr_pmhc holo A B P H L YGATPY-YFSGDTLV-AVVFTGGGNKLT-SEHNR-FQNEAQ-ASSL... RYPLTFGW hla_a_24_02

358 rows × 12 columns

[11]:
apo_holo_summary_df['ids'] = apo_holo_summary_df['file_name'].str.replace('.pdb$', '')
/var/scratch/bmcmaste/1757795/ipykernel_1746162/477882467.py:1: FutureWarning: The default value of regex will change from True to False in a future version.
  apo_holo_summary_df['ids'] = apo_holo_summary_df['file_name'].str.replace('.pdb$', '')

Annotate results with summary data

[12]:
results = results.merge(apo_holo_summary_df[['ids', 'cdr_sequences_collated', 'peptide_sequence', 'mhc_slug']],
                        left_on='complex_id',
                        right_on='ids',
                        how='left')
[13]:
results = pd.concat([results, results_holo_holo])
[14]:
results = results.merge(
    apo_holo_summary_df[['file_name', 'pdb_id', 'structure_type', 'state']],
    how='left',
    left_on='structure_x_name',
    right_on='file_name',
).merge(
    apo_holo_summary_df[['file_name', 'pdb_id', 'structure_type', 'state']],
    how='left',
    left_on='structure_y_name',
    right_on='file_name',
)
[16]:
def name_domain(chain_type: str, tcr_contact: bool) -> str | None:
    match chain_type:
        case 'antigen_chain':
            return 'peptide'

        case 'mhc_chain1':
            if tcr_contact:
                return 'mhc_tcr_contact'

            return 'mhc'

    return None

results['domain'] = results.apply(lambda row: name_domain(row.chain_type, row.tcr_contact), axis='columns')
[18]:
results['comparison'] = results['state_x'] + '-' + results['state_y']
results['comparison'] = results['comparison'].map(lambda entry: 'apo-holo' if entry == 'holo-apo' else entry)
[19]:
results['structure_comparison'] = results.apply(
    lambda row: '-'.join(sorted([row.structure_x_name, row.structure_y_name])),
    axis='columns',
)
results = results.drop_duplicates(['structure_comparison', 'domain'])
[21]:
results = results.groupby(['peptide_sequence',
                           'mhc_slug',
                           'comparison',
                           'domain'], dropna=False)['rmsd'].mean().reset_index()
[23]:
apo_holo_results = results.query("comparison == 'apo-holo'")
[24]:
apo_holo_results
[24]:
peptide_sequence mhc_slug comparison domain rmsd
3 AAGIGILTV hla_a_02_01 apo-holo mhc 0.660376
4 AAGIGILTV hla_a_02_01 apo-holo mhc_tcr_contact 0.562211
5 AAGIGILTV hla_a_02_01 apo-holo peptide 1.025622
9 ALHGGWTTK hla_a_03_01 apo-holo mhc 0.442792
10 ALHGGWTTK hla_a_03_01 apo-holo mhc_tcr_contact 0.990352
... ... ... ... ... ...
339 YGFRNVVHI h2_db apo-holo mhc_tcr_contact 0.829133
340 YGFRNVVHI h2_db apo-holo peptide 0.275006
344 YLQPRTFLL hla_a_02_01 apo-holo mhc 0.647465
345 YLQPRTFLL hla_a_02_01 apo-holo mhc_tcr_contact 0.476928
346 YLQPRTFLL hla_a_02_01 apo-holo peptide 0.512357

176 rows × 5 columns

Visualise results

[25]:
sns.boxplot(apo_holo_results, x='domain', y='rmsd')

x = np.linspace(-0.75, 2.75)
y = np.repeat(NOISE_LEVEL, len(x))

plt.plot(x, y, '--r')
plt.xlim(-0.75, 2.75)
../_images/source_pMHC_movement_between_apo_and_holo_conformations_24_0.png

Compute Statistics

[26]:
sns.displot(apo_holo_results, x='rmsd', col='domain')
[26]:
<seaborn.axisgrid.FacetGrid at 0x7f4d5adf9f60>
../_images/source_pMHC_movement_between_apo_and_holo_conformations_26_1.png
[27]:
conditions = {domain: apo_holo_results[apo_holo_results['domain'] == domain]['rmsd'].values for domain in apo_holo_results['domain'].unique()}
[28]:
scipy.stats.f_oneway(*conditions.values())
[28]:
F_onewayResult(statistic=8.43958931017577, pvalue=0.0003181981364846825)
[29]:
combos = list(itertools.combinations(conditions.items(), 2))

significane_level = 0.05 / len(combos)

ad_hoc_results = {'condition_x': [], 'condition_y': [], 'stat': [], 'p_val': []}
for (condition_x_name, condition_x), (condition_y_name, condition_y) in combos:
    ad_hoc_results['condition_x'].append(condition_x_name)
    ad_hoc_results['condition_y'].append(condition_y_name)

    stat, p_val = scipy.stats.ttest_ind(condition_x, condition_y)

    ad_hoc_results['stat'].append(stat)
    ad_hoc_results['p_val'].append(p_val)

ad_hoc_results = pd.DataFrame(ad_hoc_results)

ad_hoc_results['significant'] = ad_hoc_results['p_val'] < significane_level

ad_hoc_results
[29]:
condition_x condition_y stat p_val significant
0 mhc mhc_tcr_contact 2.116838 0.036411 False
1 mhc peptide -2.531661 0.012702 True
2 mhc_tcr_contact peptide -3.322653 0.001196 True

Based on these results it seems that the peptide undergoes conformational change but not the MHC molecule- either TCR-contacting or not.

Comparing the differences in binding between movement types

[30]:
sns.catplot(results,
            x='comparison', y='rmsd',
            col='domain',
            kind='box')
../_images/source_pMHC_movement_between_apo_and_holo_conformations_32_0.png
[31]:
sns.displot(results,
            hue='comparison', x='rmsd',
            col='domain',
            kind='kde')
[31]:
<seaborn.axisgrid.FacetGrid at 0x7f4d58acfe80>
../_images/source_pMHC_movement_between_apo_and_holo_conformations_33_1.png
[32]:
treatment_options = ['comparison', 'domain']
treatments = [(group, df['rmsd'].to_numpy()) for group, df in results.groupby(treatment_options)]
treatments
print(scipy.stats.kruskal(*[values for _, values in treatments]))
KruskalResult(statistic=89.32014468652255, pvalue=6.389533459621575e-16)
[33]:
combos = []
for pairing in list(itertools.combinations(treatments, 2)):
    domain_x = pairing[0][0][1]
    domain_y = pairing[1][0][1]

    if domain_x == domain_y:
        combos.append(pairing)

significance_level = 0.05 / len(combos)
print(significance_level)
statistics = []
p_vals = []

for ((comparison_x, domain_x), sample_x), ((comparison_y, domain_y), sample_y) in combos:
    stat, p_val = scipy.stats.ranksums(sample_x, sample_y)

    statistics.append(stat)
    p_vals.append(p_val)

momvement_type_statistics = pd.DataFrame({
    'comparison_x': [name for ((name, _,), _), _ in combos],
    'domain_x': [name for ((_, name), _), _ in combos],
    'comparison_y': [name for _, ((name, _), _) in combos],
    'domain_y': [name for _, ((_, name), _) in combos],
    'statistic': statistics,
    'p_val': p_vals,
    'significant': [p_val < significance_level for p_val in p_vals],
})

momvement_type_statistics['p_val'] = momvement_type_statistics['p_val'].map(lambda num: f'{num:.2e}')

momvement_type_statistics
0.005555555555555556
[33]:
comparison_x domain_x comparison_y domain_y statistic p_val significant
0 apo-apo mhc apo-holo mhc -4.766718 1.87e-06 True
1 apo-apo mhc holo-holo mhc -2.895550 3.78e-03 True
2 apo-apo mhc_tcr_contact apo-holo mhc_tcr_contact -5.550086 2.86e-08 True
3 apo-apo mhc_tcr_contact holo-holo mhc_tcr_contact -3.288167 1.01e-03 True
4 apo-apo peptide apo-holo peptide -5.087082 3.64e-07 True
5 apo-apo peptide holo-holo peptide -3.533553 4.10e-04 True
6 apo-holo mhc holo-holo mhc 0.687756 4.92e-01 False
7 apo-holo mhc_tcr_contact holo-holo mhc_tcr_contact 1.409336 1.59e-01 False
8 apo-holo peptide holo-holo peptide 0.228864 8.19e-01 False

Conclusion

This analysis reveals that the peptide undergoes the most change from apo to holo states and the MHC molecule has little change between these states. It does not matter whether it is the TCR contacting portion of the MHC or not. The statistical analysis comparing the differences between comparion types gives inconclusive results.