{ "cells": [ { "cell_type": "markdown", "id": "20382f65", "metadata": {}, "source": [ "# Comparing *apo* and *holo* CDR loop clustering" ] }, { "cell_type": "markdown", "id": "6b6b6403", "metadata": {}, "source": [ "## Introduction\n", "\n", "In this notebook, we set out to assess whether the CDR loops of TCRs stay in their canonical forms between *apo* and *holo* states or if they disrupt them." ] }, { "cell_type": "code", "execution_count": 1, "id": "a155933e", "metadata": {}, "outputs": [], "source": [ "import itertools\n", "\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "id": "23992f3b", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | name | \n", "cluster | \n", "chain_type | \n", "cdr | \n", "sequence | \n", "cluster_type | \n", "
---|---|---|---|---|---|---|
0 | \n", "7zt2_DE | \n", "12 | \n", "alpha_chain | \n", "1 | \n", "TSGFNG | \n", "pseudo | \n", "
1 | \n", "7zt3_DE | \n", "12 | \n", "alpha_chain | \n", "1 | \n", "TSGFNG | \n", "pseudo | \n", "
2 | \n", "7zt4_DE | \n", "12 | \n", "alpha_chain | \n", "1 | \n", "TSGFNG | \n", "pseudo | \n", "
3 | \n", "7zt5_DE | \n", "12 | \n", "alpha_chain | \n", "1 | \n", "TSGFNG | \n", "pseudo | \n", "
4 | \n", "7zt7_DE | \n", "12 | \n", "alpha_chain | \n", "1 | \n", "TSGFNG | \n", "pseudo | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4807 | \n", "6miv_CD | \n", "22 | \n", "beta_chain | \n", "3 | \n", "ASGDEGYTQY | \n", "canonical | \n", "
4808 | \n", "3rtq_CD | \n", "22 | \n", "beta_chain | \n", "3 | \n", "ASGDEGYTQY | \n", "canonical | \n", "
4809 | \n", "3dxa_NO | \n", "noise | \n", "beta_chain | \n", "3 | \n", "ASRYRDDSYNEQF | \n", "NaN | \n", "
4810 | \n", "1d9k_AB | \n", "noise | \n", "beta_chain | \n", "3 | \n", "ASGGQGRAEQF | \n", "NaN | \n", "
4811 | \n", "4gg6_GH | \n", "noise | \n", "beta_chain | \n", "3 | \n", "ASSVAVSAGTYEQY | \n", "NaN | \n", "
4812 rows × 6 columns
\n", "\n", " | file_name | \n", "pdb_id | \n", "structure_type | \n", "state | \n", "alpha_chain | \n", "beta_chain | \n", "antigen_chain | \n", "mhc_chain1 | \n", "mhc_chain2 | \n", "cdr_sequences_collated | \n", "peptide_sequence | \n", "mhc_slug | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1ao7_D-E-C-A-B_tcr_pmhc.pdb | \n", "1ao7 | \n", "tcr_pmhc | \n", "holo | \n", "D | \n", "E | \n", "C | \n", "A | \n", "B | \n", "DRGSQS-IYSNGD-AVTTDSWGKLQ-MNHEY-SVGAGI-ASRPGLA... | \n", "LLFGYPVYV | \n", "hla_a_02_01 | \n", "
1 | \n", "1b0g_C-A-B_pmhc.pdb | \n", "1b0g | \n", "pmhc | \n", "apo | \n", "NaN | \n", "NaN | \n", "C | \n", "A | \n", "B | \n", "NaN | \n", "ALWGFFPVL | \n", "hla_a_02_01 | \n", "
2 | \n", "1b0g_F-D-E_pmhc.pdb | \n", "1b0g | \n", "pmhc | \n", "apo | \n", "NaN | \n", "NaN | \n", "F | \n", "D | \n", "E | \n", "NaN | \n", "ALWGFFPVL | \n", "hla_a_02_01 | \n", "
3 | \n", "1bd2_D-E-C-A-B_tcr_pmhc.pdb | \n", "1bd2 | \n", "tcr_pmhc | \n", "holo | \n", "D | \n", "E | \n", "C | \n", "A | \n", "B | \n", "NSMFDY-ISSIKDK-AAMEGAQKLV-MNHEY-SVGAGI-ASSYPGG... | \n", "LLFGYPVYV | \n", "hla_a_02_01 | \n", "
4 | \n", "1bii_P-A-B_pmhc.pdb | \n", "1bii | \n", "pmhc | \n", "apo | \n", "NaN | \n", "NaN | \n", "P | \n", "A | \n", "B | \n", "NaN | \n", "RGPGRAFVTI | \n", "h2_dd | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
386 | \n", "7rtd_C-A-B_pmhc.pdb | \n", "7rtd | \n", "pmhc | \n", "apo | \n", "NaN | \n", "NaN | \n", "C | \n", "A | \n", "B | \n", "NaN | \n", "YLQPRTFLL | \n", "hla_a_02_01 | \n", "
387 | \n", "7rtr_D-E-C-A-B_tcr_pmhc.pdb | \n", "7rtr | \n", "tcr_pmhc | \n", "holo | \n", "D | \n", "E | \n", "C | \n", "A | \n", "B | \n", "DRGSQS-IYSNGD-AVNRDDKII-SEHNR-FQNEAQ-ASSPDIEQY | \n", "YLQPRTFLL | \n", "hla_a_02_01 | \n", "
388 | \n", "8gvb_A-B-P-H-L_tcr_pmhc.pdb | \n", "8gvb | \n", "tcr_pmhc | \n", "holo | \n", "A | \n", "B | \n", "P | \n", "H | \n", "L | \n", "YGATPY-YFSGDTLV-AVGFTGGGNKLT-SEHNR-FQNEAQ-ASSD... | \n", "RYPLTFGW | \n", "hla_a_24_02 | \n", "
389 | \n", "8gvg_A-B-P-H-L_tcr_pmhc.pdb | \n", "8gvg | \n", "tcr_pmhc | \n", "holo | \n", "A | \n", "B | \n", "P | \n", "H | \n", "L | \n", "YGATPY-YFSGDTLV-AVGFTGGGNKLT-SEHNR-FQNEAQ-ASSD... | \n", "RFPLTFGW | \n", "hla_a_24_02 | \n", "
390 | \n", "8gvi_A-B-P-H-L_tcr_pmhc.pdb | \n", "8gvi | \n", "tcr_pmhc | \n", "holo | \n", "A | \n", "B | \n", "P | \n", "H | \n", "L | \n", "YGATPY-YFSGDTLV-AVVFTGGGNKLT-SEHNR-FQNEAQ-ASSL... | \n", "RYPLTFGW | \n", "hla_a_24_02 | \n", "
391 rows × 12 columns
\n", "\n", " | name | \n", "cluster | \n", "chain_type | \n", "cdr | \n", "sequence | \n", "cluster_type | \n", "pdb_id | \n", "chains | \n", "alpha_chain | \n", "beta_chain | \n", "file_name | \n", "structure_type | \n", "state | \n", "antigen_chain | \n", "mhc_chain1 | \n", "mhc_chain2 | \n", "cdr_sequences_collated | \n", "peptide_sequence | \n", "mhc_slug | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "2ak4_DE | \n", "4 | \n", "alpha_chain | \n", "1 | \n", "TRDTTYY | \n", "canonical | \n", "2ak4 | \n", "DE | \n", "D | \n", "E | \n", "2ak4_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "TRDTTYY-RNSFDEQN-ALSGFYNTDKLI-MNHNS-SASEGT-ASP... | \n", "LPEPLPQGQLTAY | \n", "hla_b_35_08 | \n", "
1 | \n", "2ak4_DE | \n", "6 | \n", "beta_chain | \n", "1 | \n", "MNHNS | \n", "canonical | \n", "2ak4 | \n", "DE | \n", "D | \n", "E | \n", "2ak4_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "TRDTTYY-RNSFDEQN-ALSGFYNTDKLI-MNHNS-SASEGT-ASP... | \n", "LPEPLPQGQLTAY | \n", "hla_b_35_08 | \n", "
2 | \n", "2ak4_DE | \n", "6 | \n", "alpha_chain | \n", "2 | \n", "RNSFDEQN | \n", "pseudo | \n", "2ak4 | \n", "DE | \n", "D | \n", "E | \n", "2ak4_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "TRDTTYY-RNSFDEQN-ALSGFYNTDKLI-MNHNS-SASEGT-ASP... | \n", "LPEPLPQGQLTAY | \n", "hla_b_35_08 | \n", "
3 | \n", "2ak4_DE | \n", "noise | \n", "beta_chain | \n", "2 | \n", "SASEGT | \n", "NaN | \n", "2ak4 | \n", "DE | \n", "D | \n", "E | \n", "2ak4_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "TRDTTYY-RNSFDEQN-ALSGFYNTDKLI-MNHNS-SASEGT-ASP... | \n", "LPEPLPQGQLTAY | \n", "hla_b_35_08 | \n", "
4 | \n", "2ak4_DE | \n", "12 | \n", "alpha_chain | \n", "3 | \n", "ALSGFYNTDKLI | \n", "pseudo | \n", "2ak4 | \n", "DE | \n", "D | \n", "E | \n", "2ak4_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "TRDTTYY-RNSFDEQN-ALSGFYNTDKLI-MNHNS-SASEGT-ASP... | \n", "LPEPLPQGQLTAY | \n", "hla_b_35_08 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1105 | \n", "4jff_DE | \n", "0 | \n", "beta_chain | \n", "1 | \n", "GTSNPN | \n", "pseudo | \n", "4jff | \n", "DE | \n", "D | \n", "E | \n", "4jff_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "FLGSQS-TYREGD-AVNDGGRLT-GTSNPN-WGPFG-AWSETGLGM... | \n", "ELAGIGILTV | \n", "hla_a_02_01 | \n", "
1106 | \n", "4jff_DE | \n", "17 | \n", "alpha_chain | \n", "2 | \n", "TYREGD | \n", "canonical | \n", "4jff | \n", "DE | \n", "D | \n", "E | \n", "4jff_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "FLGSQS-TYREGD-AVNDGGRLT-GTSNPN-WGPFG-AWSETGLGM... | \n", "ELAGIGILTV | \n", "hla_a_02_01 | \n", "
1107 | \n", "4jff_DE | \n", "noise | \n", "beta_chain | \n", "2 | \n", "WGPFG | \n", "NaN | \n", "4jff | \n", "DE | \n", "D | \n", "E | \n", "4jff_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "FLGSQS-TYREGD-AVNDGGRLT-GTSNPN-WGPFG-AWSETGLGM... | \n", "ELAGIGILTV | \n", "hla_a_02_01 | \n", "
1108 | \n", "4jff_DE | \n", "26 | \n", "alpha_chain | \n", "3 | \n", "AVNDGGRLT | \n", "canonical | \n", "4jff | \n", "DE | \n", "D | \n", "E | \n", "4jff_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "FLGSQS-TYREGD-AVNDGGRLT-GTSNPN-WGPFG-AWSETGLGM... | \n", "ELAGIGILTV | \n", "hla_a_02_01 | \n", "
1109 | \n", "4jff_DE | \n", "noise | \n", "beta_chain | \n", "3 | \n", "AWSETGLGMGGWQ | \n", "NaN | \n", "4jff | \n", "DE | \n", "D | \n", "E | \n", "4jff_D-E-C-A-B_tcr_pmhc.pdb | \n", "tcr_pmhc | \n", "holo | \n", "C | \n", "A | \n", "B | \n", "FLGSQS-TYREGD-AVNDGGRLT-GTSNPN-WGPFG-AWSETGLGM... | \n", "ELAGIGILTV | \n", "hla_a_02_01 | \n", "
1110 rows × 19 columns
\n", "\n", " | cdr_sequences_collated | \n", "chain_type | \n", "cdr | \n", "apo_clusters | \n", "holo_clusters | \n", "
---|---|---|---|---|---|
0 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "[13] | \n", "[noise] | \n", "
1 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "2 | \n", "[noise] | \n", "[noise] | \n", "
2 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "3 | \n", "[noise] | \n", "[noise] | \n", "
3 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "beta_chain | \n", "1 | \n", "[4] | \n", "[4] | \n", "
4 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "beta_chain | \n", "2 | \n", "[2] | \n", "[2] | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
511 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "alpha_chain | \n", "2 | \n", "[noise] | \n", "[noise] | \n", "
512 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "alpha_chain | \n", "3 | \n", "[noise] | \n", "[2] | \n", "
513 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "1 | \n", "[4] | \n", "[4] | \n", "
514 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "2 | \n", "[2] | \n", "[2] | \n", "
515 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "[noise] | \n", "[noise] | \n", "
516 rows × 5 columns
\n", "\n", " | cdr_sequences_collated | \n", "chain_type | \n", "cdr | \n", "types | \n", "count | \n", "
---|---|---|---|---|---|
0 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Canonical Cluster Same | \n", "0.0 | \n", "
1 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Canonical Cluster Shift | \n", "0.0 | \n", "
2 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Canonical Cluster to Noise | \n", "1.0 | \n", "
3 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Canonical to Pseudo | \n", "0.0 | \n", "
4 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Noise | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
5671 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Noise to Pseudo Cluster | \n", "0.0 | \n", "
5672 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Pseudo Cluster Same | \n", "0.0 | \n", "
5673 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Pseudo Cluster Shift | \n", "0.0 | \n", "
5674 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Pseudo Cluster to Noise | \n", "0.0 | \n", "
5675 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Pseudo to Canonical | \n", "0.0 | \n", "
5676 rows × 5 columns
\n", "\n", " | cdr_sequences_collated | \n", "chain_type | \n", "cdr | \n", "types | \n", "count | \n", "
---|---|---|---|---|---|
0 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Canonical Cluster Same | \n", "0.0 | \n", "
1 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Canonical Cluster Shift | \n", "0.0 | \n", "
2 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Canonical Cluster to Noise | \n", "1.0 | \n", "
3 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Canonical to Pseudo | \n", "0.0 | \n", "
4 | \n", "ATGYPS-ATKADDK-ALSDPVNDMR-SGHAT-FQNNGV-ASSLRGR... | \n", "alpha_chain | \n", "1 | \n", "Noise | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
5671 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Noise to Pseudo Cluster | \n", "0.0 | \n", "
5672 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Pseudo Cluster Same | \n", "0.0 | \n", "
5673 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Pseudo Cluster Shift | \n", "0.0 | \n", "
5674 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Pseudo Cluster to Noise | \n", "0.0 | \n", "
5675 | \n", "YSGSPE-HISR-ALSGFNNAGNMLT-SGHAT-FQNNGV-ASSLGGA... | \n", "beta_chain | \n", "3 | \n", "Pseudo to Canonical | \n", "0.0 | \n", "
5676 rows × 5 columns
\n", "\n", " | Movement Type | \n", "CDR1\\textalpha{} | \n", "CDR1\\textbeta{} | \n", "CDR2\\textalpha{} | \n", "CDR2\\textbeta{} | \n", "CDR3\\textalpha{} | \n", "CDR3\\textbeta{} | \n", "
---|---|---|---|---|---|---|---|
0 | \n", "Canonical Cluster Same | \n", "66 | \n", "78 | \n", "54 | \n", "38 | \n", "16 | \n", "4 | \n", "
1 | \n", "Canonical Cluster Shift | \n", "- | \n", "11 | \n", "- | \n", "2 | \n", "- | \n", "- | \n", "
2 | \n", "Canonical Cluster to Noise | \n", "1 | \n", "- | \n", "5 | \n", "8 | \n", "- | \n", "- | \n", "
3 | \n", "Canonical to Pseudo | \n", "- | \n", "- | \n", "- | \n", "- | \n", "- | \n", "- | \n", "
4 | \n", "Noise | \n", "3 | \n", "- | \n", "8 | \n", "37 | \n", "14 | \n", "33 | \n", "
5 | \n", "Noise to Canonical Cluster | \n", "5 | \n", "2 | \n", "5 | \n", "2 | \n", "10 | \n", "9 | \n", "
6 | \n", "Noise to Pseudo Cluster | \n", "4 | \n", "- | \n", "- | \n", "4 | \n", "30 | \n", "26 | \n", "
7 | \n", "Pseudo Cluster Same | \n", "9 | \n", "5 | \n", "14 | \n", "- | \n", "26 | \n", "24 | \n", "
8 | \n", "Pseudo Cluster Shift | \n", "- | \n", "- | \n", "- | \n", "- | \n", "- | \n", "- | \n", "
9 | \n", "Pseudo Cluster to Noise | \n", "- | \n", "- | \n", "10 | \n", "5 | \n", "- | \n", "- | \n", "
10 | \n", "Pseudo to Canonical | \n", "8 | \n", "- | \n", "- | \n", "- | \n", "- | \n", "- | \n", "