"""Module-level constants: dtypes, type encoders/decoders, per-residue atom
layout and NERF/chi lookup tables, visualization settings, backbone geometry
and atomic radii."""
import os

from rdkit import Chem
import torch
import numpy as np


# ------------------------------------------------------------------------------
# Computational
# ------------------------------------------------------------------------------
FLOAT_TYPE = torch.float32
INT_TYPE = torch.int64


# ------------------------------------------------------------------------------
# Type encoding/decoding
# ------------------------------------------------------------------------------
# The atom vocabulary is selected once, at import time, via the ATOM_DICT
# environment variable: 'simple' selects the 11-type vocabulary, any other
# value (including unset) selects the extended 14-type vocabulary.
atom_dict = os.environ.get('ATOM_DICT')
if atom_dict == 'simple':
    atom_decoder = ['C', 'N', 'O', 'S', 'B', 'Br', 'Cl', 'P', 'I', 'F', 'NOATOM']
else:
    atom_decoder = ['C', 'N', 'O', 'S', 'B', 'Br', 'Cl', 'P', 'I', 'F', 'NH', 'N+', 'O-', 'NOATOM']
# Encoder is derived from the decoder so the two can never disagree.
atom_encoder = {symbol: index for index, symbol in enumerate(atom_decoder)}

bond_encoder = {"NOBOND": 0, "SINGLE": 1, "DOUBLE": 2, "TRIPLE": 3, 'AROMATIC': 4}
bond_decoder = [None, Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]

# One-letter amino acid codes.
aa_decoder = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
aa_encoder = {letter: index for index, letter in enumerate(aa_decoder)}

residue_decoder = ['CA', 'SS']
residue_encoder = {name: index for index, name in enumerate(residue_decoder)}

residue_bond_encoder = {'CA-CA': 0, 'CA-SS': 1, 'NOBOND': 2}
residue_bond_decoder = ['CA-CA', 'CA-SS', None]

# Earlier layout without the 'CA' entry, kept commented out for reference:
# aa_atom_index = {
#     'A': {'N': 0, 'C': 1, 'O': 2, 'CB': 3},
#     'C': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'SG': 4},
#     'D': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'OD1': 5, 'OD2': 6},
#     'E': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5, 'OE1': 6, 'OE2': 7},
#     'F': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD1': 5, 'CD2': 6, 'CE1': 7, 'CE2': 8, 'CZ': 9},
#     'G': {'N': 0, 'C': 1, 'O': 2},
#     'H': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'ND1': 5, 'CD2': 6, 'CE1': 7, 'NE2': 8},
#     'I': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG1': 4, 'CG2': 5, 'CD1': 6},
#     'K': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5, 'CE': 6, 'NZ': 7},
#     'L': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD1': 5, 'CD2': 6},
#     'M': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'SD': 5, 'CE': 6},
#     'N': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'OD1': 5, 'ND2': 6},
#     'P': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5},
#     'Q': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5, 'OE1': 6, 'NE2': 7},
#     'R': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5, 'NE': 6, 'CZ': 7, 'NH1': 8, 'NH2': 9},
#     'S': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'OG': 4},
#     'T': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'OG1': 4, 'CG2': 5},
#     'V': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG1': 4, 'CG2': 5},
#     'W': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD1': 5, 'CD2': 6, 'NE1': 7, 'CE2': 8, 'CE3': 9, 'CZ2': 10, 'CZ3': 11, 'CH2': 12},
#     'Y': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD1': 5, 'CD2': 6, 'CE1': 7, 'CE2': 8, 'CZ': 9, 'OH': 10},
# }

# Per-residue mapping from atom name to its slot index (contiguous from 0).
aa_atom_index = {
    'A': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4},
    'C': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'SG': 5},
    'D': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'OD1': 6, 'OD2': 7},
    'E': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6, 'OE1': 7, 'OE2': 8},
    'F': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD1': 6, 'CD2': 7, 'CE1': 8, 'CE2': 9, 'CZ': 10},
    'G': {'N': 0, 'CA': 1, 'C': 2, 'O': 3},
    'H': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'ND1': 6, 'CD2': 7, 'CE1': 8, 'NE2': 9},
    'I': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG1': 5, 'CG2': 6, 'CD1': 7},
    'K': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6, 'CE': 7, 'NZ': 8},
    'L': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD1': 6, 'CD2': 7},
    'M': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'SD': 6, 'CE': 7},
    'N': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'OD1': 6, 'ND2': 7},
    'P': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6},
    'Q': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6, 'OE1': 7, 'NE2': 8},
    'R': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6, 'NE': 7, 'CZ': 8, 'NH1': 9, 'NH2': 10},
    'S': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'OG': 5},
    'T': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'OG1': 5, 'CG2': 6},
    'V': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG1': 5, 'CG2': 6},
    'W': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD1': 6, 'CD2': 7, 'NE1': 8, 'CE2': 9, 'CE3': 10, 'CZ2': 11, 'CZ3': 12, 'CH2': 13},
    'Y': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD1': 6, 'CD2': 7, 'CE1': 8, 'CE2': 9, 'CZ': 10, 'OH': 11},
}


# ------------------------------------------------------------------------------
# NERF
# ------------------------------------------------------------------------------
# Number of atom slots per residue in the padded tables below (14, set by 'W').
_MAX_ATOMS_PER_RESIDUE = max(len(atoms) for atoms in aa_atom_index.values())

# indicates whether atom exists
# Derived from aa_atom_index: slot i is True exactly when the residue defines
# an atom with index i (indices are contiguous, so this is a prefix of Trues).
aa_atom_mask = {
    aa: [slot < len(atoms) for slot in range(_MAX_ATOMS_PER_RESIDUE)]
    for aa, atoms in aa_atom_index.items()
}

# (14, 3) index tensor with atom indices of atoms a, b and c for NERF reconstruction
# in principle, columns 1 and 2 can be inferred from column one (immediate predecessor) alone
# Rows for non-existent atoms are padded with [0, 0, 0].
aa_nerf_indices = {
    'A': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'C': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'D': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'E': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [6, 5, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'F': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [6, 5, 4], [7, 5, 4], [8, 6, 5], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    # Row 3 (O) was [2, 0, 0]; every other residue uses [2, 1, 0] (C, CA, N),
    # which is also what the predecessor-chain rule above yields.
    'G': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'H': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [6, 5, 4], [7, 5, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'I': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [4, 1, 0], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'K': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [7, 6, 5], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'L': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'M': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'N': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'P': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'Q': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [6, 5, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'R': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [7, 6, 5], [8, 7, 6], [8, 7, 6], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'S': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'T': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [4, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'V': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [4, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
    'W': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [6, 5, 4], [7, 5, 4], [7, 5, 4], [9, 7, 5], [10, 7, 5], [11, 9, 7]],
    'Y': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [6, 5, 4], [7, 5, 4], [8, 6, 5], [10, 8, 6], [0, 0, 0], [0, 0, 0]],
}

# unique id for each rotatable bond (0=chi1, 1=chi2, ...)
aa_bond_to_chi = {
    'A': {},
    'C': {('CA', 'CB'): 0},
    'D': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
    'E': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'CD'): 2},
    'F': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
    'G': {},
    'H': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
    # chi2 of isoleucine is anchored on CD1 (see aa_chi_anchor_atom), whose
    # predecessors are CG1, CB, CA -> the rotated bond is CB-CG1 (was 'CG2').
    'I': {('CA', 'CB'): 0, ('CB', 'CG1'): 1},
    'K': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'CD'): 2, ('CD', 'CE'): 3},
    'L': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
    'M': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'SD'): 2},
    'N': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
    'P': {},
    'Q': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'CD'): 2},
    'R': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'CD'): 2, ('CD', 'NE'): 3, ('NE', 'CZ'): 4},
    'S': {('CA', 'CB'): 0},
    'T': {('CA', 'CB'): 0},
    'V': {('CA', 'CB'): 0},
    'W': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
    'Y': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
}

# index between 0 and 4 to retrieve chi angles, -1 means not a rotatable bond
aa_chi_indices = {
    'A': [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
    'C': [-1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1],
    'D': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
    'E': [-1, -1, -1, -1, -1, 0, 1, 2, 2, -1, -1, -1, -1, -1],
    'F': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
    'G': [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
    'H': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
    'I': [-1, -1, -1, -1, -1, 0, 0, 1, -1, -1, -1, -1, -1, -1],
    'K': [-1, -1, -1, -1, -1, 0, 1, 2, 3, -1, -1, -1, -1, -1],
    'L': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
    'M': [-1, -1, -1, -1, -1, 0, 1, 2, -1, -1, -1, -1, -1, -1],
    'N': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
    'P': [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
    'Q': [-1, -1, -1, -1, -1, 0, 1, 2, 2, -1, -1, -1, -1, -1],
    'R': [-1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 4, -1, -1, -1],
    'S': [-1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1],
    'T': [-1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -1],
    'V': [-1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -1],
    'W': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
    'Y': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
}

# key: chi index (0=chi1, 1=chi2, ...); value: index of atom that defines the chi angle (together with its three predecessors)
aa_chi_anchor_atom = {
    'A': {},
    'C': {0: 5},
    'D': {0: 5, 1: 6},
    'E': {0: 5, 1: 6, 2: 7},
    'F': {0: 5, 1: 6},
    'G': {},
    'H': {0: 5, 1: 6},
    'I': {0: 5, 1: 7},
    'K': {0: 5, 1: 6, 2: 7, 3: 8},
    'L': {0: 5, 1: 6},
    'M': {0: 5, 1: 6, 2: 7},
    'N': {0: 5, 1: 6},
    'P': {},
    'Q': {0: 5, 1: 6, 2: 7},
    'R': {0: 5, 1: 6, 2: 7, 3: 8, 4: 9},
    'S': {0: 5},
    'T': {0: 5},
    'V': {0: 5},
    'W': {0: 5, 1: 6},
    'Y': {0: 5, 1: 6},
}


# ------------------------------------------------------------------------------
# Visualization
# ------------------------------------------------------------------------------
# PyMOL colors, see: https://pymolwiki.org/index.php/Color_Values#Chemical_element_colours
# NOTE(review): both lists have 11 entries, which matches the 'simple' atom
# vocabulary only; the extended vocabulary has 14 types, so indexing these by
# an extended atom type >= 11 would raise IndexError -- confirm against callers.
colors_dic = ['#33ff33', '#3333ff', '#ff4d4d', '#e6c540', '#ffb5b5', '#A62929', '#1FF01F', '#ff8000', '#940094', '#B3FFFF', '#b3e3f5']
radius_dic = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]


# ------------------------------------------------------------------------------
# Backbone geometry
# Taken from: Bhagavan, N. V., and C. E. Ha.
# "Chapter 4-Three-dimensional structure of proteins and disorders of protein misfolding."
# Essentials of Medical Biochemistry (2015): 31-51.
# https://www.sciencedirect.com/science/article/pii/B978012416687500004X
# ------------------------------------------------------------------------------
N_CA_DIST = 1.47
CA_C_DIST = 1.53
N_CA_C_ANGLE = 110 * np.pi / 180  # 110 degrees expressed in radians


# ------------------------------------------------------------------------------
# Atom radii
# ------------------------------------------------------------------------------
# # https://en.wikipedia.org/wiki/Covalent_radius#Radii_for_multiple_bonds
# # (2023/04/14)
# covalent_radii = {'H': [32, None, None],
#                   'C': [75, 67, 60],
#                   'N': [71, 60, 54],
#                   'O': [63, 57, 53],
#                   'F': [64, 59, 53],
#                   'B': [85, 78, 73],
#                   'Al': [126, 113, 111],
#                   'Si': [116, 107, 102],
#                   'P': [111, 102, 94],
#                   'S': [103, 94, 95],
#                   'Cl': [99, 95, 93],
#                   'As': [121, 114, 106],
#                   'Br': [114, 109, 110],
#                   'I': [133, 129, 125],
#                   'Hg': [133, 142, None],
#                   'Bi': [151, 141, 135]}

# source: https://en.wikipedia.org/wiki/Van_der_Waals_radius
vdw_radii = {'N': 1.55,
             'O': 1.52,
             'C': 1.70,
             'H': 1.10,
             'S': 1.80,
             'P': 1.80,
             'Se': 1.90,
             'K': 2.75,
             'Na': 2.27,
             'Mg': 1.73,
             'Zn': 1.39,
             'B': 1.92,
             'Br': 1.85,
             'Cl': 1.75,
             'I': 1.98,
             'F': 1.47}


WEBDATASET_SHARD_SIZE = 50000
WEBDATASET_VAL_SIZE = 100