diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml index 4794d69..6ba614f 100644 --- a/.github/workflows/python-build.yml +++ b/.github/workflows/python-build.yml @@ -36,3 +36,13 @@ jobs: run: | pip install wheel python setup.py bdist_wheel + - name: Build source distribution + run: | + python setup.py sdist + - name: Upload wheel and source distribution + uses: actions/upload-artifact@v3 + with: + name: python-package-${{ matrix.python-version }} + path: | + dist/*.whl + dist/*.tar.gz diff --git a/pyproject.toml b/pyproject.toml index b337a8e..4c2efc2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,8 +3,8 @@ requires = ["setuptools", "wheel", "Cython"] build-backend = "setuptools.build_meta" [project] -name = "Aposteriori" -version = "2.1.2" +name = "aposteriori" +version = "2.2.0" requires-python = ">= 3.8" readme = "README.md" dependencies = [ @@ -14,3 +14,6 @@ dependencies = [ "numpy", "flake8" ] + +[project.scripts] +make-frame-dataset = "aposteriori.data_prep.cli:cli" \ No newline at end of file diff --git a/setup.py b/setup.py index 548e9e0..3a8ddf2 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="aposteriori", - version="2.1.2", + version="2.2.0", author="Wells Wood Research Group", author_email="chris.wood@ed.ac.uk", description="A library for the voxelization of protein structures for protein design.", diff --git a/src/aposteriori/config.py b/src/aposteriori/config.py index e03d6cb..ff16341 100644 --- a/src/aposteriori/config.py +++ b/src/aposteriori/config.py @@ -3,17 +3,17 @@ from ampal.data import ELEMENT_DATA # Config paths -MAKE_FRAME_DATASET_VER = "2.1.0" +MAKE_FRAME_DATASET_VER = "2.2.0" PROJECT_ROOT_DIR = pathlib.Path(__file__).parent DATA_FOLDER = PROJECT_ROOT_DIR / "data" DATA_FOLDER.mkdir(parents=True, exist_ok=True) ATOM_COLORS = { # Atomic number : Color - 0: ELEMENT_DATA['C']['CPK'], # Carbon - 1: ELEMENT_DATA['N']['CPK'], # Nitrogen - 2: ELEMENT_DATA['O']['CPK'], # Oxygen + 0: ELEMENT_DATA["C"]["CPK"], # Carbon + 1: ELEMENT_DATA["N"]["CPK"], # Nitrogen + 2: ELEMENT_DATA["O"]["CPK"], # Oxygen 3: "orange", # +1 - 4: "green" # +2 + 4: "green", # +2 } ATOM_VANDERWAAL_RADII = { # Atomic number : Radius @@ -27,5 +27,630 @@ PDB_CODES = ["1qys", "6ct4"] HDF5_STRUCTURES_PATH = DATA_FOLDER / "frame_dataset.hdf5" FETCH_PDB = True -UNCOMMON_RESIDUE_DICT = {"DLY": "LYS", "OTH": "THR", "GHP": "GLY", "YOF": "TYR", "HS9": "HIS", "HVA": "VAL", "C5C": "CYS", "TMD": "THR", "NC1": "SER", "CSR": "CYS", "LYP": "LYS", "PR4": "PRO", "KPI": "LYS", "02K": "ALA", "4AW": "TRP", "MLE": "LEU", "NMM": "ARG", "DNE": "LEU", "NYS": "CYS", "SEE": "SER", "DSG": "ASN", "ALA": "ALA", "CSA": "CYS", "SCH": "CYS", "TQQ": "TRP", "PTM": "TYR", "XPR": "PRO", "VLL": "UNK", "B3Y": "TYR", "PAQ": "TYR", "FME": "MET", "NAL": "ALA", "TYI": "TYR", "OXX": "ASP", "CSS": "CYS", "OCS": "CYS", "193": "UNK", "GLJ": "GLU", "PM3": "PHE", "DTR": "TRP", "MEQ": "GLN", "HSO": "HIS", "TYW": "TYR", "LED": "LEU", "PHL": "PHE", "TDD": "LEU", "MEA": "PHE", "FGA": "GLU", "GGL": "GLU", "PSH": "HIS", "3CF": "PHE", "MSE": "MET", "2SO": "HIS", "B3S": "SER", "PSW": "SEC", "C4R": "CYS", "XCP": "UNK", "LYF": "LYS", "WFP": "PHE", "A8E": "VAL", "0AF": "TRP", "PEC": "CYS", "JJJ": "CYS", "3TY": "UNK", "SVY": "SER", "DIL": "ILE", "MHS": "HIS", "MME": "MET", "MMO": "ARG", "B3A": "ALA", "CHG": "UNK", "PHI": "PHE", "AR2": "ARG", "MND": "ASN", "BTR": "TRP", "AEI": "ASP", "TIH": "ALA", "DDE": "HIS", "S1H": "SER", "DSE": "SER", "AR4": "GLU", "FDL": "LYS", "PRJ": "PRO", "CY3": "CYS", "2TY": "TYR", "AR7": "ARG", "CTH": "THR", "DTY": "TYR", "SYS": "CYS", "C1X": "LYS", "SVV": "SER", "ASN": "ASN", "SNC": "CYS", "AKZ": "ASP", "OMY": "TYR", "JJL": "CYS", "XSN": "ASN", "0UO": "TRP", "TCQ": "TYR", "OSE": "SER", "NPH": "CYS", "0A0": "ASP", "1PA": "PHE", "SIC": "CYS", "TY8": "TYR", "AYA": "ALA", "ALN": "ALA", "SXE": "SER", "B3T": "UNK", "BB9": "CYS", "HL2": "LEU", "0AR": "ARG", "SVA": "SER", "DBB": "THR", "KPY": "LYS", "DPP": "ALA", "32S": "UNK", "FGL": "GLY", "N80": "PRO", "IGL": "GLY", "PF5": "PHE", "OYL": "HIS", "MNL": "LEU", "PBF": "PHE", "CEA": "CYS", "OHI": "HIS", "ESC": "MET", "2JG": "SER", "1X6": "SER", "4BF": "TYR", "MAA": "ALA", "3X9": "CYS", "BFD": "ASP", "CZ2": "CYS", "23P": "ALA", "I4G": "GLY", "CMT": "CYS", "LVN": "VAL", "OAS": "SER", "TY2": "TYR", "SCS": "CYS", "PFX": "UNK", "MF3": "UNK", "OBS": "LYS", "GL3": "GLY", "0A9": "PHE", "MVA": "VAL", "B3Q": "UNK", "DOA": "UNK", "MP8": "PRO", "CYR": "CYS", "5PG": "GLY", "ILY": "LYS", "DNW": "ALA", "BCX": "CYS", "AZK": "LYS", "AAR": "ARG", "TRN": "TRP", "NBQ": "TYR", "RVX": "SER", "PSA": "PHE", "Z3E": "THR", "OCY": "CYS", "2ZC": "SER", "N2C": "UNK", "SBD": "SER", "MSA": "GLY", "SET": "SER", "HS8": "HIS", "SMF": "PHE", "HYP": "PRO", "PYX": "CYS", "XPL": "PYL", "DMK": "ASP", "BIF": "PHE", "M3L": "LYS", "CYF": "CYS", "O12": "UNK", "SRZ": "SER", "LAL": "ALA", "2MR": "ARG", "4PH": "PHE", "2LT": "TYR", "LPL": "UNK", "3YM": "TYR", "LRK": "LYS", "FVA": "VAL", "MED": "MET", "ILM": "ILE", "6CL": "LYS", "CXM": "MET", "DHV": "VAL", "PR3": "CYS", "HAR": "ARG", "KWS": "GLY", "SAR": "GLY", "0LF": "PRO", "45F": "PRO", "12A": "A","CLG": "LYS", "DHI": "HIS", "PTR": "TYR", "DMT": "UNK", "OMT": "MET", "TBG": "VAL", "PLJ": "PRO", "IAM": "ALA", "DBY": "TYR", "CPC": "UNK", "GLZ": "GLY", "4FW": "TRP", "SLZ": "LYS", "HIA": "HIS", "FOE": "CYS", "IYR": "TYR", "KST": "LYS", "B3M": "UNK", "BB6": "CYS", "CYW": "CYS", "MPQ": "GLY", "HHK": "LYS", "HGL": "UNK", "SE7": "ALA", "ELY": "LYS", "TRO": "TRP", "DNP": "ALA", "MK8": "LEU", "200": "PHE", "WVL": "VAL", "LPD": "PRO", "NCB": "ALA", "DDZ": "ALA", "MYK": "LYS", "OLD": "HIS", "DYS": "CYS", "LET": "LYS", "ESB": "TYR", "HR7": "ARG", "DI7": "TYR", "QCS": "CYS", "ASA": "ASP", "CSX": "CYS", "P3Q": "TYR", "OHS": "ASP", "SOY": "SER", "EHP": "PHE", "ZCL": "PHE", "32T": "UNK", "AHB": "ASN", "TRX": "TRP", "0AK": "ASP", "TH5": "THR", "GHG": "GLN", "XW1": "ALA", "23F": "PHE", "1OP": "TYR", "AGT": "CYS", "PYA": "ALA", "2MT": "PRO", "4FB": "PRO", "CSB": "CYS", "TRQ": "TRP", "MDO": "GLY", "CAS": "CYS", "TTQ": "TRP", "T0I": "TYR", "LLY": "LYS", "GVL": "SER", "BPE": "CYS", "0TD": "ASP", "TYY": "TYR", "BH2": "ASP", "D3P": "GLY", "CY4": "CYS", "CHP": "GLY", "DFO": "UNK", "NLB": "LEU", "QPH": "PHE", "DTH": "THR", "LLO": "LYS", "LYN": "LYS", "DPN": "PHE", "EFC": "CYS", "FP9": "PRO", "OMX": "TYR", "AGQ": "TYR", "PHD": "ASP", "PR9": "PRO", "B3L": "UNK", "LYX": "LYS", "IT1": "LYS", "DBU": "THR", "0A8": "CYS", "TYX": "UNK", "QMM": "GLN", "CME": "CYS", "ACB": "ASP", "TRF": "TRP", "HOX": "PHE", "DA2": "ARG", "DNS": "LYS", "BIL": "UNK", "SUN": "SER", "TYJ": "TYR", "3PX": "PRO", "CLD": "SER", "IPG": "GLY", "CLH": "LYS", "XCN": "CYS", "CZZ": "CYS", "THO": "UNK", "CY1": "CYS", "CYS": "CYS", "PFF": "PHE", "MLL": "LEU", "PG1": "SER", "BMT": "THR", "CSZ": "CYS", "DSN": "SER", "NIY": "TYR", "FH7": "LYS", "CGV": "CYS", "SVZ": "SER", "ORQ": "ARG", "DLS": "LYS", "DVA": "VAL", "BHD": "ASP", "TPQ": "TYR", "STY": "TYR", "CSP": "CYS", "31Q": "CYS", "B3E": "GLU", "LEF": "LEU", "GLH": "GLU", "LCK": "LYS", "GME": "GLU", "FHO": "LYS", "MDH": "UNK", "ECC": "GLN", "34E": "VAL", "ASB": "ASP", "HCS": "UNK", "KYN": "TRP", "OIC": "UNK", "VR0": "ARG", "U2X": "TYR", "PHE": "PHE", "TYS": "TYR", "SBG": "SER", "A5N": "ASN", "CYD": "CYS", "4DP": "TRP", "3AH": "HIS", "FCL": "PHE", "PRV": "GLY", "CYQ": "CYS", "MBQ": "TYR", "DAS": "ASP", "CS4": "CYS", "B3K": "LYS", "NLE": "LEU", "143": "CYS", "PR7": "PRO", "DAH": "PHE", "LE1": "VAL", "TQZ": "CYS", "LGY": "LYS", "CML": "CYS", "CSW": "CYS", "N10": "SER", "2RX": "SER", "TOQ": "TRP", "0AH": "SER", "P2Q": "TYR", "CYG": "CYS", "DGL": "GLU", "KOR": "MET", "DAR": "ARG", "2ML": "LEU", "PTH": "TYR", "CCS": "CYS", "HMR": "ARG", "33X": "ALA", "UN2": "UNK", "IML": "ILE", "4CY": "MET", "ZZJ": "ALA", "DFI": "UNK", "TIS": "SER", "LLP": "LYS", "MHU": "PHE", "QPA": "CYS", "175": "GLY", "SAH": "CYS", "IIL": "ILE", "BCS": "CYS", "R4K": "TRP", "TYQ": "TYR", "NCY": "UNK", "FT6": "TRP", "OBF": "UNK", "0CS": "ALA", "4HL": "TYR", "TXY": "TYR", "DOH": "ASP", "CSE": "CYS", "DAB": "ALA", "GLK": "GLU", "TYN": "TYR", "LEI": "VAL", "M0H": "CYS", "CLB": "SER", "MGG": "ARG", "CGU": "GLU", "UF0": "SER", "SLL": "LYS", "ML3": "LYS", "HPH": "PHE", "SME": "MET", "ALC": "ALA", "ASL": "ASP", "CHS": "UNK", "2TL": "THR", "HT7": "TRP", "SGB": "SER", "OPR": "ARG", "B3D": "ASP", "FLT": "TYR", "DGN": "GLN", "4CF": "PHE", "HLU": "LEU", "FZN": "LYS", "C6C": "CYS", "HTI": "CYS", "OMH": "SER", "WLU": "LEU", "23S": "UNK", "U3X": "PHE", "SEB": "SER", "DBZ": "ALA", "BB7": "CYS", "2RA": "ALA", "SCY": "CYS", "6CW": "TRP", "AHP": "ALA", "ARO": "ARG", "RE3": "TRP", "1TQ": "TRP", "VDL": "UNK", "4IN": "TRP", "GFT": "SER", "CPI": "UNK", "LSO": "LYS", "CGA": "GLU", "MLZ": "LYS", "HTR": "TRP", "00C": "CYS", "FAK": "LYS", "PRS": "PRO", "ME0": "MET", "SDP": "SER", "HSL": "SER", "C3Y": "CYS", "823": "ASN", "PHA": "PHE", "LYZ": "LYS", "HTN": "ASN", "LP6": "LYS", "ALV": "ALA", "NVA": "VAL", "CSD": "CYS", "DMH": "ASN", "PG9": "GLY", "PCA": "GLU", "KCX": "LYS", "MDF": "TYR", "TYB": "TYR", "MHL": "LEU", "GNC": "GLN", "NLO": "LEU", "MEN": "ASN", "POM": "PRO", "2HF": "HIS", "CY0": "CYS", "ZYK": "PRO", "R1A": "CYS", "CAF": "CYS", "YCM": "CYS", "ORN": "ALA", "H5M": "PRO", "MLY": "LYS", "KYQ": "LYS", "DPQ": "TYR", "MIS": "SER", "TPO": "THR", "XX1": "LYS", "SMC": "CYS", "DHA": "SER", "MGN": "GLN", "FLA": "ALA", "ILX": "ILE", "QIL": "ILE", "2KP": "LYS", "CS1": "CYS", "HNC": "CYS", "PRK": "LYS", "LYR": "LYS", "DM0": "LYS", "TSY": "CYS", "NYB": "CYS", "MHO": "MET", "KFP": "LYS", "SEN": "SER", "999": "ASP", "VLM": "UNK", "CMH": "CYS", "ONL": "UNK", "M2L": "LYS", "LME": "GLU", "AIB": "ALA", "CYJ": "LYS", "CS3": "CYS", "WPA": "PHE", "MTY": "TYR", "MIR": "SER", "HZP": "PRO", "LTA": "UNK", "HIP": "HIS", "PPN": "PHE", "APK": "LYS", "HPE": "PHE", "SVX": "SER", "JJK": "CYS", "03Y": "CYS", "D4P": "UNK", "1AC": "ALA", "B3X": "ASN", "0FL": "ALA", "2KK": "LYS", "LMQ": "GLN", "RE0": "TRP", "MSO": "MET", "ZYJ": "PRO", "GMA": "GLU", "DPR": "PRO", "1TY": "TYR", "TOX": "TRP", "DPL": "PRO", "M2S": "MET", "4HT": "TRP", "BUC": "CYS", "C1S": "CYS", "TA4": "UNK", "CSO": "CYS", "5CW": "TRP", "TRW": "TRP", "DCY": "CYS", "DAL": "ALA", "0QL": "CYS", "THC": "THR", "FGP": "SER", "MCS": "CYS", "AZH": "ALA", "HIQ": "HIS", "ABA": "ASN", "TH6": "THR", "FHL": "LYS", "ZAL": "ALA", "ICY": "CYS", "IZO": "MET", "F2F": "PHE", "VAI": "VAL", "TY5": "TYR", "07O": "CYS", "AA4": "ALA", "RGL": "ARG", "SAC": "SER", "PXU": "PRO", "NFA": "PHE", "LA2": "LYS", "0BN": "PHE", "LYK": "LYS", "FTY": "TYR", "NZH": "HIS", "CSJ": "CYS", "30V": "CYS", "DLE": "LEU", "TLY": "LYS", "L3O": "LEU", "LDH": "LYS", "NEP": "HIS", "ALY": "LYS", "GPL": "LYS", "01W": "UNK", "WRP": "TRP", "MCL": "LYS", "2AS": "UNK", "CSU": "CYS", "SOC": "CYS", "HRG": "ARG", "NMC": "GLY", "TYO": "TYR", "LHC": "UNK", "D11": "THR", "I2M": "ILE", "TTS": "TYR", "FC0": "PHE", "HIC": "HIS", "YPZ": "TYR", "5CS": "CYS", "SEP": "SER", "BBC": "CYS", "3MY": "TYR", "HQA": "ALA", "11Q": "PRO", "AGM": "ARG", "BG1": "SER", "IAS": "ASP", "SBL": "SER", "56A": "HIS", "FTR": "TRP", "DIV": "VAL", "ALO": "THR", "BTK": "LYS"} +UNCOMMON_RESIDUE_DICT = { + "DLY": "LYS", + "OTH": "THR", + "GHP": "GLY", + "YOF": "TYR", + "HS9": "HIS", + "HVA": "VAL", + "C5C": "CYS", + "TMD": "THR", + "NC1": "SER", + "CSR": "CYS", + "LYP": "LYS", + "PR4": "PRO", + "KPI": "LYS", + "02K": "ALA", + "4AW": "TRP", + "MLE": "LEU", + "NMM": "ARG", + "DNE": "LEU", + "NYS": "CYS", + "SEE": "SER", + "DSG": "ASN", + "ALA": "ALA", + "CSA": "CYS", + "SCH": "CYS", + "TQQ": "TRP", + "PTM": "TYR", + "XPR": "PRO", + "VLL": "UNK", + "B3Y": "TYR", + "PAQ": "TYR", + "FME": "MET", + "NAL": "ALA", + "TYI": "TYR", + "OXX": "ASP", + "CSS": "CYS", + "OCS": "CYS", + "193": "UNK", + "GLJ": "GLU", + "PM3": "PHE", + "DTR": "TRP", + "MEQ": "GLN", + "HSO": "HIS", + "TYW": "TYR", + "LED": "LEU", + "PHL": "PHE", + "TDD": "LEU", + "MEA": "PHE", + "FGA": "GLU", + "GGL": "GLU", + "PSH": "HIS", + "3CF": "PHE", + "MSE": "MET", + "2SO": "HIS", + "B3S": "SER", + "PSW": "SEC", + "C4R": "CYS", + "XCP": "UNK", + "LYF": "LYS", + "WFP": "PHE", + "A8E": "VAL", + "0AF": "TRP", + "PEC": "CYS", + "JJJ": "CYS", + "3TY": "UNK", + "SVY": "SER", + "DIL": "ILE", + "MHS": "HIS", + "MME": "MET", + "MMO": "ARG", + "B3A": "ALA", + "CHG": "UNK", + "PHI": "PHE", + "AR2": "ARG", + "MND": "ASN", + "BTR": "TRP", + "AEI": "ASP", + "TIH": "ALA", + "DDE": "HIS", + "S1H": "SER", + "DSE": "SER", + "AR4": "GLU", + "FDL": "LYS", + "PRJ": "PRO", + "CY3": "CYS", + "2TY": "TYR", + "AR7": "ARG", + "CTH": "THR", + "DTY": "TYR", + "SYS": "CYS", + "C1X": "LYS", + "SVV": "SER", + "ASN": "ASN", + "SNC": "CYS", + "AKZ": "ASP", + "OMY": "TYR", + "JJL": "CYS", + "XSN": "ASN", + "0UO": "TRP", + "TCQ": "TYR", + "OSE": "SER", + "NPH": "CYS", + "0A0": "ASP", + "1PA": "PHE", + "SIC": "CYS", + "TY8": "TYR", + "AYA": "ALA", + "ALN": "ALA", + "SXE": "SER", + "B3T": "UNK", + "BB9": "CYS", + "HL2": "LEU", + "0AR": "ARG", + "SVA": "SER", + "DBB": "THR", + "KPY": "LYS", + "DPP": "ALA", + "32S": "UNK", + "FGL": "GLY", + "N80": "PRO", + "IGL": "GLY", + "PF5": "PHE", + "OYL": "HIS", + "MNL": "LEU", + "PBF": "PHE", + "CEA": "CYS", + "OHI": "HIS", + "ESC": "MET", + "2JG": "SER", + "1X6": "SER", + "4BF": "TYR", + "MAA": "ALA", + "3X9": "CYS", + "BFD": "ASP", + "CZ2": "CYS", + "23P": "ALA", + "I4G": "GLY", + "CMT": "CYS", + "LVN": "VAL", + "OAS": "SER", + "TY2": "TYR", + "SCS": "CYS", + "PFX": "UNK", + "MF3": "UNK", + "OBS": "LYS", + "GL3": "GLY", + "0A9": "PHE", + "MVA": "VAL", + "B3Q": "UNK", + "DOA": "UNK", + "MP8": "PRO", + "CYR": "CYS", + "5PG": "GLY", + "ILY": "LYS", + "DNW": "ALA", + "BCX": "CYS", + "AZK": "LYS", + "AAR": "ARG", + "TRN": "TRP", + "NBQ": "TYR", + "RVX": "SER", + "PSA": "PHE", + "Z3E": "THR", + "OCY": "CYS", + "2ZC": "SER", + "N2C": "UNK", + "SBD": "SER", + "MSA": "GLY", + "SET": "SER", + "HS8": "HIS", + "SMF": "PHE", + "HYP": "PRO", + "PYX": "CYS", + "XPL": "PYL", + "DMK": "ASP", + "BIF": "PHE", + "M3L": "LYS", + "CYF": "CYS", + "O12": "UNK", + "SRZ": "SER", + "LAL": "ALA", + "2MR": "ARG", + "4PH": "PHE", + "2LT": "TYR", + "LPL": "UNK", + "3YM": "TYR", + "LRK": "LYS", + "FVA": "VAL", + "MED": "MET", + "ILM": "ILE", + "6CL": "LYS", + "CXM": "MET", + "DHV": "VAL", + "PR3": "CYS", + "HAR": "ARG", + "KWS": "GLY", + "SAR": "GLY", + "0LF": "PRO", + "45F": "PRO", + "12A": "A", + "CLG": "LYS", + "DHI": "HIS", + "PTR": "TYR", + "DMT": "UNK", + "OMT": "MET", + "TBG": "VAL", + "PLJ": "PRO", + "IAM": "ALA", + "DBY": "TYR", + "CPC": "UNK", + "GLZ": "GLY", + "4FW": "TRP", + "SLZ": "LYS", + "HIA": "HIS", + "FOE": "CYS", + "IYR": "TYR", + "KST": "LYS", + "B3M": "UNK", + "BB6": "CYS", + "CYW": "CYS", + "MPQ": "GLY", + "HHK": "LYS", + "HGL": "UNK", + "SE7": "ALA", + "ELY": "LYS", + "TRO": "TRP", + "DNP": "ALA", + "MK8": "LEU", + "200": "PHE", + "WVL": "VAL", + "LPD": "PRO", + "NCB": "ALA", + "DDZ": "ALA", + "MYK": "LYS", + "OLD": "HIS", + "DYS": "CYS", + "LET": "LYS", + "ESB": "TYR", + "HR7": "ARG", + "DI7": "TYR", + "QCS": "CYS", + "ASA": "ASP", + "CSX": "CYS", + "P3Q": "TYR", + "OHS": "ASP", + "SOY": "SER", + "EHP": "PHE", + "ZCL": "PHE", + "32T": "UNK", + "AHB": "ASN", + "TRX": "TRP", + "0AK": "ASP", + "TH5": "THR", + "GHG": "GLN", + "XW1": "ALA", + "23F": "PHE", + "1OP": "TYR", + "AGT": "CYS", + "PYA": "ALA", + "2MT": "PRO", + "4FB": "PRO", + "CSB": "CYS", + "TRQ": "TRP", + "MDO": "GLY", + "CAS": "CYS", + "TTQ": "TRP", + "T0I": "TYR", + "LLY": "LYS", + "GVL": "SER", + "BPE": "CYS", + "0TD": "ASP", + "TYY": "TYR", + "BH2": "ASP", + "D3P": "GLY", + "CY4": "CYS", + "CHP": "GLY", + "DFO": "UNK", + "NLB": "LEU", + "QPH": "PHE", + "DTH": "THR", + "LLO": "LYS", + "LYN": "LYS", + "DPN": "PHE", + "EFC": "CYS", + "FP9": "PRO", + "OMX": "TYR", + "AGQ": "TYR", + "PHD": "ASP", + "PR9": "PRO", + "B3L": "UNK", + "LYX": "LYS", + "IT1": "LYS", + "DBU": "THR", + "0A8": "CYS", + "TYX": "UNK", + "QMM": "GLN", + "CME": "CYS", + "ACB": "ASP", + "TRF": "TRP", + "HOX": "PHE", + "DA2": "ARG", + "DNS": "LYS", + "BIL": "UNK", + "SUN": "SER", + "TYJ": "TYR", + "3PX": "PRO", + "CLD": "SER", + "IPG": "GLY", + "CLH": "LYS", + "XCN": "CYS", + "CZZ": "CYS", + "THO": "UNK", + "CY1": "CYS", + "CYS": "CYS", + "PFF": "PHE", + "MLL": "LEU", + "PG1": "SER", + "BMT": "THR", + "CSZ": "CYS", + "DSN": "SER", + "NIY": "TYR", + "FH7": "LYS", + "CGV": "CYS", + "SVZ": "SER", + "ORQ": "ARG", + "DLS": "LYS", + "DVA": "VAL", + "BHD": "ASP", + "TPQ": "TYR", + "STY": "TYR", + "CSP": "CYS", + "31Q": "CYS", + "B3E": "GLU", + "LEF": "LEU", + "GLH": "GLU", + "LCK": "LYS", + "GME": "GLU", + "FHO": "LYS", + "MDH": "UNK", + "ECC": "GLN", + "34E": "VAL", + "ASB": "ASP", + "HCS": "UNK", + "KYN": "TRP", + "OIC": "UNK", + "VR0": "ARG", + "U2X": "TYR", + "PHE": "PHE", + "TYS": "TYR", + "SBG": "SER", + "A5N": "ASN", + "CYD": "CYS", + "4DP": "TRP", + "3AH": "HIS", + "FCL": "PHE", + "PRV": "GLY", + "CYQ": "CYS", + "MBQ": "TYR", + "DAS": "ASP", + "CS4": "CYS", + "B3K": "LYS", + "NLE": "LEU", + "143": "CYS", + "PR7": "PRO", + "DAH": "PHE", + "LE1": "VAL", + "TQZ": "CYS", + "LGY": "LYS", + "CML": "CYS", + "CSW": "CYS", + "N10": "SER", + "2RX": "SER", + "TOQ": "TRP", + "0AH": "SER", + "P2Q": "TYR", + "CYG": "CYS", + "DGL": "GLU", + "KOR": "MET", + "DAR": "ARG", + "2ML": "LEU", + "PTH": "TYR", + "CCS": "CYS", + "HMR": "ARG", + "33X": "ALA", + "UN2": "UNK", + "IML": "ILE", + "4CY": "MET", + "ZZJ": "ALA", + "DFI": "UNK", + "TIS": "SER", + "LLP": "LYS", + "MHU": "PHE", + "QPA": "CYS", + "175": "GLY", + "SAH": "CYS", + "IIL": "ILE", + "BCS": "CYS", + "R4K": "TRP", + "TYQ": "TYR", + "NCY": "UNK", + "FT6": "TRP", + "OBF": "UNK", + "0CS": "ALA", + "4HL": "TYR", + "TXY": "TYR", + "DOH": "ASP", + "CSE": "CYS", + "DAB": "ALA", + "GLK": "GLU", + "TYN": "TYR", + "LEI": "VAL", + "M0H": "CYS", + "CLB": "SER", + "MGG": "ARG", + "CGU": "GLU", + "UF0": "SER", + "SLL": "LYS", + "ML3": "LYS", + "HPH": "PHE", + "SME": "MET", + "ALC": "ALA", + "ASL": "ASP", + "CHS": "UNK", + "2TL": "THR", + "HT7": "TRP", + "SGB": "SER", + "OPR": "ARG", + "B3D": "ASP", + "FLT": "TYR", + "DGN": "GLN", + "4CF": "PHE", + "HLU": "LEU", + "FZN": "LYS", + "C6C": "CYS", + "HTI": "CYS", + "OMH": "SER", + "WLU": "LEU", + "23S": "UNK", + "U3X": "PHE", + "SEB": "SER", + "DBZ": "ALA", + "BB7": "CYS", + "2RA": "ALA", + "SCY": "CYS", + "6CW": "TRP", + "AHP": "ALA", + "ARO": "ARG", + "RE3": "TRP", + "1TQ": "TRP", + "VDL": "UNK", + "4IN": "TRP", + "GFT": "SER", + "CPI": "UNK", + "LSO": "LYS", + "CGA": "GLU", + "MLZ": "LYS", + "HTR": "TRP", + "00C": "CYS", + "FAK": "LYS", + "PRS": "PRO", + "ME0": "MET", + "SDP": "SER", + "HSL": "SER", + "C3Y": "CYS", + "823": "ASN", + "PHA": "PHE", + "LYZ": "LYS", + "HTN": "ASN", + "LP6": "LYS", + "ALV": "ALA", + "NVA": "VAL", + "CSD": "CYS", + "DMH": "ASN", + "PG9": "GLY", + "PCA": "GLU", + "KCX": "LYS", + "MDF": "TYR", + "TYB": "TYR", + "MHL": "LEU", + "GNC": "GLN", + "NLO": "LEU", + "MEN": "ASN", + "POM": "PRO", + "2HF": "HIS", + "CY0": "CYS", + "ZYK": "PRO", + "R1A": "CYS", + "CAF": "CYS", + "YCM": "CYS", + "ORN": "ALA", + "H5M": "PRO", + "MLY": "LYS", + "KYQ": "LYS", + "DPQ": "TYR", + "MIS": "SER", + "TPO": "THR", + "XX1": "LYS", + "SMC": "CYS", + "DHA": "SER", + "MGN": "GLN", + "FLA": "ALA", + "ILX": "ILE", + "QIL": "ILE", + "2KP": "LYS", + "CS1": "CYS", + "HNC": "CYS", + "PRK": "LYS", + "LYR": "LYS", + "DM0": "LYS", + "TSY": "CYS", + "NYB": "CYS", + "MHO": "MET", + "KFP": "LYS", + "SEN": "SER", + "999": "ASP", + "VLM": "UNK", + "CMH": "CYS", + "ONL": "UNK", + "M2L": "LYS", + "LME": "GLU", + "AIB": "ALA", + "CYJ": "LYS", + "CS3": "CYS", + "WPA": "PHE", + "MTY": "TYR", + "MIR": "SER", + "HZP": "PRO", + "LTA": "UNK", + "HIP": "HIS", + "PPN": "PHE", + "APK": "LYS", + "HPE": "PHE", + "SVX": "SER", + "JJK": "CYS", + "03Y": "CYS", + "D4P": "UNK", + "1AC": "ALA", + "B3X": "ASN", + "0FL": "ALA", + "2KK": "LYS", + "LMQ": "GLN", + "RE0": "TRP", + "MSO": "MET", + "ZYJ": "PRO", + "GMA": "GLU", + "DPR": "PRO", + "1TY": "TYR", + "TOX": "TRP", + "DPL": "PRO", + "M2S": "MET", + "4HT": "TRP", + "BUC": "CYS", + "C1S": "CYS", + "TA4": "UNK", + "CSO": "CYS", + "5CW": "TRP", + "TRW": "TRP", + "DCY": "CYS", + "DAL": "ALA", + "0QL": "CYS", + "THC": "THR", + "FGP": "SER", + "MCS": "CYS", + "AZH": "ALA", + "HIQ": "HIS", + "ABA": "ASN", + "TH6": "THR", + "FHL": "LYS", + "ZAL": "ALA", + "ICY": "CYS", + "IZO": "MET", + "F2F": "PHE", + "VAI": "VAL", + "TY5": "TYR", + "07O": "CYS", + "AA4": "ALA", + "RGL": "ARG", + "SAC": "SER", + "PXU": "PRO", + "NFA": "PHE", + "LA2": "LYS", + "0BN": "PHE", + "LYK": "LYS", + "FTY": "TYR", + "NZH": "HIS", + "CSJ": "CYS", + "30V": "CYS", + "DLE": "LEU", + "TLY": "LYS", + "L3O": "LEU", + "LDH": "LYS", + "NEP": "HIS", + "ALY": "LYS", + "GPL": "LYS", + "01W": "UNK", + "WRP": "TRP", + "MCL": "LYS", + "2AS": "UNK", + "CSU": "CYS", + "SOC": "CYS", + "HRG": "ARG", + "NMC": "GLY", + "TYO": "TYR", + "LHC": "UNK", + "D11": "THR", + "I2M": "ILE", + "TTS": "TYR", + "FC0": "PHE", + "HIC": "HIS", + "YPZ": "TYR", + "5CS": "CYS", + "SEP": "SER", + "BBC": "CYS", + "3MY": "TYR", + "HQA": "ALA", + "11Q": "PRO", + "AGM": "ARG", + "BG1": "SER", + "IAS": "ASP", + "SBL": "SER", + "56A": "HIS", + "FTR": "TRP", + "DIV": "VAL", + "ALO": "THR", + "BTK": "LYS", +} UNCOMMON_RES_CONVERSION = True diff --git a/src/aposteriori/data_prep/cli.py b/src/aposteriori/data_prep/cli.py index d0ef86d..2275235 100644 --- a/src/aposteriori/data_prep/cli.py +++ b/src/aposteriori/data_prep/cli.py @@ -1,16 +1,16 @@ -import click import pathlib import sys import typing as t import warnings +import click + from aposteriori.data_prep.create_frame_data_set import ( Codec, - make_frame_dataset, StrOrPath, default_atom_filter, download_pdb_from_csv_file, - filter_structures_by_blacklist, + make_frame_dataset, ) @@ -113,7 +113,7 @@ @click.option( "-ae", "--atom_encoder", - type=click.Choice(["CNO", "CNOCB", "CNOCBCA"]), + type=click.Choice(["CNO", "CNOCB", "CNOCBCA", "CNOCBCAQ", "CNOCBCAP"]), default="CNO", required=True, help=( @@ -164,9 +164,7 @@ "--tag_rotamers", type=bool, default=False, - help=( - "Whether to tag rotamer information to the frame (True) or not (False)." - ), + help=("Whether to tag rotamer information to the frame (True) or not (False)."), ) def cli( structure_file_folder: str, @@ -271,12 +269,18 @@ def cli( codec = Codec.CNOCB() elif atom_encoder == "CNOCBCA": codec = Codec.CNOCBCA() + elif atom_encoder == "CNOCBCAQ": + codec = Codec.CNOCBCAQ() + elif atom_encoder == "CNOCBCAP": + codec = Codec.CNOCBCAP() else: assert atom_encoder in [ "CNO", "CNOCB", "CNOCBCA", - ], f"Expected encoder to be CNO, CNOCB, CNOCBCA but got {atom_encoder}" + "CNOCBCAQ", + "CNOCBCAP", + ], f"Expected encoder to be CNO, CNOCB, CNOCBCA, CNOCBCAQ, CNOCBCAP, but got {atom_encoder}" make_frame_dataset( structure_files=structure_files, diff --git a/src/aposteriori/data_prep/create_frame_data_set.py b/src/aposteriori/data_prep/create_frame_data_set.py index c8b78ca..a9207f8 100644 --- a/src/aposteriori/data_prep/create_frame_data_set.py +++ b/src/aposteriori/data_prep/create_frame_data_set.py @@ -14,20 +14,18 @@ import urllib import warnings from dataclasses import dataclass -from multiprocessing import Pool from itertools import repeat - +from multiprocessing import Pool import ampal import ampal.geometry as geometry import h5py import numpy as np +from ampal.amino_acids import residue_charge, polarity_Zimmerman, standard_amino_acids -from ampal.amino_acids import standard_amino_acids from aposteriori.config import ( ATOM_VANDERWAAL_RADII, MAKE_FRAME_DATASET_VER, - PDB_PATH, PDB_REQUEST_URL, UNCOMMON_RESIDUE_DICT, ) @@ -43,6 +41,7 @@ class ResidueResult: voxels_as_gaussian: bool rotamers: str + @dataclass class DatasetMetadata: make_frame_dataset_ver: str @@ -104,6 +103,14 @@ def CNOCB(cls): def CNOCBCA(cls): return cls(["C", "N", "O", "CB", "CA"]) + @classmethod + def CNOCBCAQ(cls): + return cls(["C", "N", "O", "CB", "CA", "Q"]) + + @classmethod + def CNOCBCAP(cls): + return cls(["C", "N", "O", "CB", "CA", "P"]) + def encode_atom(self, atom_label: str) -> np.ndarray: """ Encodes atoms in a boolean array depending on the type of encoding chosen. @@ -288,6 +295,22 @@ def encode_cb_to_ampal_residue(residue: ampal.Residue): return +def encode_cb_prevox(residue: ampal.Residue): + """ + Encodes a Cb atom to all of the AMPAL residues before the voxelisation begins. The Cb is added to an average position + calculated by averaging the Cb coordinates of the aligned frames for the 1QYS protein. + + Parameters + ---------- + residue: ampal.Residue + Focus residues that requires the Cb atom. + + """ + align_to_residue_plane(residue) + encode_cb_to_ampal_residue(residue) + return + + def within_frame(frame_edge_length: float, atom: ampal.Atom) -> bool: """Tests if an atom is within the `frame_edge_length` of the origin.""" half_frame_edge_length = frame_edge_length / 2 @@ -427,7 +450,7 @@ def convert_atom_to_gaussian_density( # Calculate Density: voxel_density = np.exp( -((vx - x) ** 2 + (vy - y) ** 2 + (vz - z) ** 2) - / wanderwaal_radius ** 2 + / wanderwaal_radius**2 ) # Add density to frame: gaussian_frame[vy, vx, vz] = voxel_density @@ -486,6 +509,7 @@ def add_gaussian_at_position( atom_coord: t.Tuple[int, int, int], atom_idx: int, atomic_center: t.Tuple[int, int, int] = (1, 1, 1), + normalize: bool = True, ) -> np.ndarray: """ Adds a 3D array (of a gaussian atom) to a specific coordinate of a frame. @@ -534,7 +558,8 @@ def add_gaussian_at_position( + empty_frame_voxels.shape[2], # max z ] # Normalize local densities by sum of all densities (so that they all sum up to 1): - density_matrix_slice /= np.sum(density_matrix_slice) + if normalize: + density_matrix_slice /= np.sum(density_matrix_slice) # Slice the Frame to select the portion that contains the atom of interest: frame_slice = empty_frame_voxels[ max(atom_coord[0] - int(density_matrix_slice.shape[0] / 2), 0) : max( @@ -609,12 +634,16 @@ def create_residue_frame( voxel_edge_length = frame_edge_length / voxels_per_side assembly = residue.parent.parent chain = residue.parent + if "P" in codec.atomic_labels: + if residue.mol_letter in standard_amino_acids.keys(): + res_property = -1 if polarity_Zimmerman[residue.mol_letter] < 20 else 1 + else: + res_property = 0 + # res_property = -1 if res_property < 20 else 1 + elif "Q" in codec.atomic_labels: + res_property = residue_charge[residue.mol_letter] align_to_residue_plane(residue) - # Create a Cb atom at avg postion: - if "CB" in codec.atomic_labels: - if encode_cb: - encode_cb_to_ampal_residue(residue) frame = np.zeros( (voxels_per_side, voxels_per_side, voxels_per_side, codec.encoder_length), @@ -649,9 +678,10 @@ def create_residue_frame( # If the voxel is a gaussian, there may be remnants of a nearby atom # hence this test would fail if not voxels_as_gaussian: - np.testing.assert_array_equal( - frame[indices], np.array([False] * len(frame[indices]), dtype=bool) - ) + if not atom.res_label == "CB": + np.testing.assert_array_equal( + frame[indices], np.array([False] * len(frame[indices]), dtype=bool) + ) # Encode atoms: if voxels_as_gaussian: modifiers_triple = calculate_atom_coord_modifier_within_voxel( @@ -669,10 +699,29 @@ def create_residue_frame( atom_coord=indices, atom_idx=atom_idx, ) + if ( + "Q" in codec.atomic_labels + or "P" in codec.atomic_labels + and res_property != 0 + ): + gaussian_atom = gaussian_matrix[:, :, :, atom_idx] * float(res_property) + # Add at position: + frame = add_gaussian_at_position( + main_matrix=frame, + secondary_matrix=gaussian_atom, + atom_coord=indices, + atom_idx=5, + normalize=False, + ) else: # Encode atom as voxel: frame[indices] = Codec.encode_atom(codec, atom.res_label) - + if ( + "Q" in codec.atomic_labels + or "P" in codec.atomic_labels + and res_property != 0 + ): + frame[indices] = res_property centre = voxels_per_side // 2 # Check whether central atom is C: if "CA" in codec.atomic_labels: @@ -735,6 +784,13 @@ def voxelise_assembly( if not atom_filter_fn(atom): del atom.parent.atoms[atom.res_label] del atom + if "CB" in codec.atomic_labels: + if encode_cb: + for chain in assembly: + if not isinstance(chain, ampal.Polypeptide): + continue + for residue in chain: + encode_cb_prevox(residue) remaining_atoms = len(list(assembly.get_atoms())) print(f"{name}: Filtered {total_atoms - remaining_atoms} of {total_atoms} atoms.") for chain in assembly: @@ -770,7 +826,9 @@ def voxelise_assembly( if any(v is None for v in residue.tags["rotamers"]): rota = "NAN" else: - rota = "".join(np.array(residue.tags["rotamers"], dtype=str).tolist()) + rota = "".join( + np.array(residue.tags["rotamers"], dtype=str).tolist() + ) else: rota = "NAN" # Save results: @@ -781,7 +839,7 @@ def voxelise_assembly( encoded_residue=encoded_residue, data=array, voxels_as_gaussian=voxels_as_gaussian, - rotamers=rota + rotamers=rota, ) ) if verbosity > 1: @@ -804,7 +862,7 @@ def create_frames_from_structure( codec: object, voxels_as_gaussian: bool, voxelise_all_states: bool, - tag_rotamers: bool + tag_rotamers: bool, ) -> t.Tuple[str, ChainDict]: """Creates residue frames for each residue in the structure. @@ -865,6 +923,7 @@ def create_frames_from_structure( voxels_as_gaussian, tag_rotamers, ) + result.append(curr_result) else: if isinstance(assembly, ampal.AmpalContainer): @@ -957,7 +1016,7 @@ def process_single_path( codec, voxels_as_gaussian=voxels_as_gaussian, voxelise_all_states=voxelise_all_states, - tag_rotamers=tag_rotamers + tag_rotamers=tag_rotamers, ) except Exception as e: result = str(e) @@ -1127,7 +1186,7 @@ def process_paths( codec, voxels_as_gaussian, voxelise_all_states, - tag_rotamers + tag_rotamers, ), ) for proc_i in range(processes) @@ -1335,7 +1394,10 @@ def _fetch_pdb( ) elif len(pdb_code) == 4: if voxelise_all_states: - if isinstance(pdb_structure, ampal.AmpalContainer) and len(pdb_structure) > 1: + if ( + isinstance(pdb_structure, ampal.AmpalContainer) + and len(pdb_structure) > 1 + ): ext_pdb = output_path.suffix out_paths = [] @@ -1578,7 +1640,7 @@ def make_frame_dataset( print(f"Will attempt to process {total_files} structure file/s.") print(f"Output file will be written to `{output_file_path.resolve()}`.") voxel_edge_length = frame_edge_length / voxels_per_side - max_voxel_distance = np.sqrt(voxel_edge_length ** 2 * 3) + max_voxel_distance = np.sqrt(voxel_edge_length**2 * 3) print(f"Frame edge length = {frame_edge_length:.2f} A") print(f"Voxels per side = {voxels_per_side}") print(f"Voxels will have an edge length of {voxel_edge_length:.2f} A.") diff --git a/tests/test_create_frame_data_set.py b/tests/test_create_frame_data_set.py index 4ed475c..0b15d83 100644 --- a/tests/test_create_frame_data_set.py +++ b/tests/test_create_frame_data_set.py @@ -1,44 +1,95 @@ """Tests data processing functionality in src/aposteriori/create_frame_dataset.py""" -from pathlib import Path import copy import tempfile +from pathlib import Path -from hypothesis import given, settings -from hypothesis.strategies import integers import ampal import ampal.geometry as g import aposteriori.data_prep.create_frame_data_set as cfds +from aposteriori.data_prep.create_frame_data_set import default_atom_filter import h5py import numpy as np import numpy.testing as npt import pytest +from ampal.amino_acids import residue_charge, polarity_Zimmerman, standard_amino_acids +from hypothesis import given, settings +from hypothesis.strategies import integers + +import aposteriori.data_prep.create_frame_data_set as cfds TEST_DATA_DIR = Path("tests/testing_files/pdb_files/") +def test_cb_position(): + assembly = ampal.load_pdb(str(TEST_DATA_DIR / "3qy1.pdb")) + frame_edge_length = 12.0 + voxels_per_side = 21 + codec = cfds.Codec.CNOCB() + cfds.voxelise_assembly( + assembly, + name="3qy1", + atom_filter_fn=default_atom_filter, + frame_edge_length=frame_edge_length, + voxels_per_side=voxels_per_side, + encode_cb=True, + codec=codec, + tag_rotamers=False, + chain_dict={}, + voxels_as_gaussian=False, + verbosity=1, + chain_filter_list=["A", "B"], + ) + + for chain in assembly: + for residue in chain: + if not isinstance(residue, ampal.Residue): + continue + cfds.align_to_residue_plane(residue) + assert np.isclose( + residue["CB"].x, + (residue["CA"].x - 0.741287356), + ), f"The Cb has not been encoded at position X = -0.741287356" + assert np.isclose( + residue["CB"].y, + (residue["CA"].y - 0.53937931), + ), f"The Cb has not been encoded at position Y = -0.53937931" + assert np.isclose( + residue["CB"].z, + (residue["CA"].z - 1.224287356), + ), f"The Cb has not been encoded at position Z = -1.224287356" + + @settings(deadline=1500) @given(integers(min_value=0, max_value=214)) def test_create_residue_frame_cnocb_encoding(residue_number): + assert (TEST_DATA_DIR / "3qy1.pdb").exists(), "File does not exist" assembly = ampal.load_pdb(str(TEST_DATA_DIR / "3qy1.pdb")) focus_residue = assembly[0][residue_number] # Make sure that residue correctly aligns peptide plane to XY - cfds.align_to_residue_plane(focus_residue) - cfds.encode_cb_to_ampal_residue(focus_residue) + cfds.encode_cb_prevox(focus_residue) assert np.array_equal( - focus_residue["CA"].array, (0, 0, 0,) + focus_residue["CA"].array, + ( + 0, + 0, + 0, + ), ), "The CA atom should lie on the origin." assert np.isclose(focus_residue["N"].x, 0), "The nitrogen atom should lie on XY." assert np.isclose(focus_residue["N"].z, 0), "The nitrogen atom should lie on XY." assert np.isclose(focus_residue["C"].z, 0), "The carbon atom should lie on XY." assert np.isclose( - focus_residue["CB"].x, -0.741287356, + focus_residue["CB"].x, + -0.741287356, ), f"The Cb has not been encoded at position X = -0.741287356" assert np.isclose( - focus_residue["CB"].y, -0.53937931, + focus_residue["CB"].y, + -0.53937931, ), f"The Cb has not been encoded at position Y = -0.53937931" assert np.isclose( - focus_residue["CB"].z, -1.224287356, + focus_residue["CB"].z, + -1.224287356, ), f"The Cb has not been encoded at position Z = -1.224287356" # Make sure that all relevant atoms are pulled into the frame frame_edge_length = 12.0 @@ -58,19 +109,38 @@ def test_create_residue_frame_cnocb_encoding(residue_number): codec = cfds.Codec.CNOCB() # Make sure that aligned residue sits on XY after it is discretized single_res_assembly = ampal.Assembly( - molecules=ampal.Polypeptide(monomers=copy.deepcopy(focus_residue).backbone) + molecules=ampal.Polypeptide( + monomers=copy.deepcopy(focus_residue).backbone, polymer_id="A" + ) ) # Need to reassign the parent so that the residue is the only thing in the assembly single_res_assembly[0].parent = single_res_assembly single_res_assembly[0][0].parent = single_res_assembly[0] - array = cfds.create_residue_frame( - single_res_assembly[0][0], frame_edge_length, voxels_per_side, encode_cb=True, codec=codec) - np.testing.assert_array_equal(array[centre, centre, centre], [True, False, False, False], err_msg="The central atom should be CA.") - nonzero_indices = list(zip(*np.nonzero(array))) + chaindict = cfds.voxelise_assembly( + single_res_assembly[0][0].parent.parent, + name="3qy1", + atom_filter_fn=default_atom_filter, + frame_edge_length=frame_edge_length, + voxels_per_side=voxels_per_side, + encode_cb=True, + codec=codec, + tag_rotamers=False, + chain_dict={}, + voxels_as_gaussian=False, + verbosity=1, + chain_filter_list=["A"], + )[1] + array_test = chaindict["A"][0].data + np.testing.assert_array_equal( + array_test[centre, centre, centre], + [True, False, False, False], + err_msg="The central atom should be CA.", + ) + nonzero_indices = list(zip(*np.nonzero(array_test))) assert ( len(nonzero_indices) == 5 ), "There should be only 5 backbone atoms in this frame" - nonzero_on_xy_indices = list(zip(*np.nonzero(array[:, :, centre]))) + nonzero_on_xy_indices = list(zip(*np.nonzero(array_test[:, :, centre]))) assert ( 3 <= len(nonzero_on_xy_indices) <= 4 ), "N, CA and C should lie on the xy plane." @@ -85,7 +155,12 @@ def test_create_residue_frame_backbone_only(residue_number): # Make sure that residue correctly aligns peptide plane to XY cfds.align_to_residue_plane(focus_residue) assert np.array_equal( - focus_residue["CA"].array, (0, 0, 0,) + focus_residue["CA"].array, + ( + 0, + 0, + 0, + ), ), "The CA atom should lie on the origin." assert np.isclose(focus_residue["N"].x, 0), "The nitrogen atom should lie on XY." assert np.isclose(focus_residue["N"].z, 0), "The nitrogen atom should lie on XY." @@ -115,10 +190,17 @@ def test_create_residue_frame_backbone_only(residue_number): # Obtain atom encoder: codec = cfds.Codec.CNO() array = cfds.create_residue_frame( - single_res_assembly[0][0], frame_edge_length, voxels_per_side, - encode_cb=False, codec=codec + single_res_assembly[0][0], + frame_edge_length, + voxels_per_side, + encode_cb=False, + codec=codec, + ) + np.testing.assert_array_equal( + array[centre, centre, centre], + [True, False, False], + err_msg="The central atom should be CA.", ) - np.testing.assert_array_equal(array[centre, centre, centre], [True, False, False], err_msg="The central atom should be CA.") nonzero_indices = list(zip(*np.nonzero(array))) assert ( len(nonzero_indices) == 4 @@ -145,7 +227,7 @@ def test_even_voxels_per_side(voxels_per_side): voxels_per_side=voxels_per_side, require_confirmation=False, encode_cb=True, - codec=codec + codec=codec, ) @@ -187,7 +269,7 @@ def test_make_frame_dataset(): verbosity=1, require_confirmation=False, codec=codec, - tag_rotamers=True + tag_rotamers=True, ) with h5py.File(output_file_path, "r") as dataset: for n in range(1, 77): @@ -202,9 +284,11 @@ def test_make_frame_dataset(): codec=codec, ) rota = "" - for r in ampal_1ubq[0][n-1].tags["rotamers"]: + for r in ampal_1ubq[0][n - 1].tags["rotamers"]: rota += str(r) - assert rota == dataset["1ubq"]["A"][residue_number].attrs["rotamers"], f'Tags Rotamer mismatch found at position {n}: {dataset["1ubq"]["A"][residue_number].attrs["rotamers"]} but expected {rota}' + assert ( + rota == dataset["1ubq"]["A"][residue_number].attrs["rotamers"] + ), f'Tags Rotamer mismatch found at position {n}: {dataset["1ubq"]["A"][residue_number].attrs["rotamers"]} but expected {rota}' hdf5_array = dataset["1ubq"]["A"][residue_number][()] npt.assert_array_equal( hdf5_array, @@ -218,13 +302,17 @@ def test_make_frame_dataset(): def test_convert_atom_to_gaussian_density(): # No modifiers: - opt_frame = cfds.convert_atom_to_gaussian_density((0,0,0), 0.6, optimized=True) - non_opt_frame = cfds.convert_atom_to_gaussian_density((0,0,0), 0.6, optimized=False) + opt_frame = cfds.convert_atom_to_gaussian_density((0, 0, 0), 0.6, optimized=True) + non_opt_frame = cfds.convert_atom_to_gaussian_density( + (0, 0, 0), 0.6, optimized=False + ) np.testing.assert_array_almost_equal(opt_frame, non_opt_frame, decimal=2) np.testing.assert_almost_equal(np.sum(non_opt_frame), np.sum(opt_frame)) # With modifiers: opt_frame = cfds.convert_atom_to_gaussian_density((0.5, 0, 0), 0.6, optimized=True) - non_opt_frame = cfds.convert_atom_to_gaussian_density((0.5, 0, 0), 0.6, optimized=False) + non_opt_frame = cfds.convert_atom_to_gaussian_density( + (0.5, 0, 0), 0.6, optimized=False + ) np.testing.assert_array_almost_equal(opt_frame, non_opt_frame, decimal=2) @@ -277,6 +365,152 @@ def test_make_frame_dataset_as_gaussian(): ) +def test_make_frame_dataset_as_gaussian_cnocacbq(): + """Tests the creation of a frame data set.""" + test_file = TEST_DATA_DIR / "1ubq.pdb" + frame_edge_length = 18.0 + voxels_per_side = 31 + codec = cfds.Codec.CNOCBCAQ() + ampal_1ubq = ampal.load_pdb(str(test_file)) + ampal_1ubq2 = ampal.load_pdb(str(test_file)) + + test_frame = cfds.voxelise_assembly( + ampal_1ubq2, + name="1ubq", + atom_filter_fn=default_atom_filter, + frame_edge_length=frame_edge_length, + voxels_per_side=voxels_per_side, + encode_cb=True, + codec=codec, + tag_rotamers=False, + chain_dict={}, + voxels_as_gaussian=True, + verbosity=1, + chain_filter_list=["A"], + )[1] + + array_test = [] + for k in range(76): + array_test.append(test_frame["A"][k].data) + + for atom in ampal_1ubq.get_atoms(): + if not cfds.default_atom_filter(atom): + del atom.parent.atoms[atom.res_label] + del atom + with tempfile.TemporaryDirectory() as tmpdir: + # Obtain atom encoder: + output_file_path = cfds.make_frame_dataset( + structure_files=[test_file], + output_folder=tmpdir, + name="test_dataset", + frame_edge_length=frame_edge_length, + voxels_per_side=voxels_per_side, + verbosity=1, + require_confirmation=False, + codec=codec, + voxels_as_gaussian=True, + ) + with h5py.File(output_file_path, "r") as dataset: + for n in range(1, 77): + # check that the frame for all the data frames match between the input + # arrays and the ones that come out of the HDF5 data set + residue_number = str(n) + test_residue = array_test[n - 1] + hdf5_array = dataset["1ubq"]["A"][residue_number][()] + npt.assert_array_equal( + hdf5_array, + test_residue, + err_msg=( + "The frame in the HDF5 data set should be the same as the " + "input frame." + ), + ) + charge = residue_charge[ampal_1ubq["A"][residue_number].mol_letter] + if charge > 0: + assert np.max(test_residue[:, :, :, 5]) > 0 + if charge < 0: + assert np.min(test_residue[:, :, :, 5]) < 0 + + +def test_make_frame_dataset_as_gaussian_cnocacbp(): + """Tests the creation of a frame data set.""" + test_file = TEST_DATA_DIR / "1ubq.pdb" + frame_edge_length = 18.0 + voxels_per_side = 31 + codec = cfds.Codec.CNOCBCAP() + + ampal_1ubq = ampal.load_pdb(str(test_file)) + ampal_1ubq2 = ampal.load_pdb(str(test_file)) + + test_frame = cfds.voxelise_assembly( + ampal_1ubq2, + name="1ubq", + atom_filter_fn=default_atom_filter, + frame_edge_length=frame_edge_length, + voxels_per_side=voxels_per_side, + encode_cb=True, + codec=codec, + tag_rotamers=False, + chain_dict={}, + voxels_as_gaussian=True, + verbosity=1, + chain_filter_list=["A"], + )[1] + + array_test = [] + for k in range(76): + array_test.append(test_frame["A"][k].data) + + for atom in ampal_1ubq.get_atoms(): + if not cfds.default_atom_filter(atom): + del atom.parent.atoms[atom.res_label] + del atom + with tempfile.TemporaryDirectory() as tmpdir: + # Obtain atom encoder: + output_file_path = cfds.make_frame_dataset( + structure_files=[test_file], + output_folder=tmpdir, + name="test_dataset", + frame_edge_length=frame_edge_length, + voxels_per_side=voxels_per_side, + verbosity=1, + require_confirmation=False, + codec=codec, + voxels_as_gaussian=True, + ) + with h5py.File(output_file_path, "r") as dataset: + for n in range(1, 77): + # check that the frame for all the data frames match between the input + # arrays and the ones that come out of the HDF5 data set + residue_number = str(n) + residue_test = array_test[n - 1] + hdf5_array = dataset["1ubq"]["A"][residue_number][()] + npt.assert_array_equal( + hdf5_array, + residue_test, + err_msg=( + "The frame in the HDF5 data set should be the same as the " + "input frame." + ), + ) + if ( + ampal_1ubq["A"][residue_number].mol_letter + in standard_amino_acids.keys() + ): + polarity = ( + -1 + if polarity_Zimmerman[ampal_1ubq["A"][residue_number].mol_letter] + < 20 + else 1 + ) + else: + polarity = 0 + if polarity == 1: + assert np.max(residue_test[:, :, :, 5]) > 0 + if polarity == 0: + assert np.min(residue_test[:, :, :, 5]) < 0 + + @settings(deadline=700) @given(integers(min_value=0, max_value=214)) def test_default_atom_filter(residue_number: int): @@ -287,7 +521,9 @@ def test_default_atom_filter(residue_number: int): for atom in focus_residue: filtered_atom = True if atom.res_label in backbone_atoms else False filtered_scenario = cfds.default_atom_filter(atom) - assert filtered_atom == filtered_scenario, f"Expected {atom.res_label} to return {filtered_atom} after filter" + assert ( + filtered_atom == filtered_scenario + ), f"Expected {atom.res_label} to return {filtered_atom} after filter" @settings(deadline=700) @@ -300,7 +536,9 @@ def test_cb_atom_filter(residue_number: int): for atom in focus_residue: filtered_atom = True if atom.res_label in backbone_atoms else False filtered_scenario = cfds.keep_sidechain_cb_atom_filter(atom) - assert filtered_atom == filtered_scenario, f"Expected {atom.res_label} to return {filtered_atom} after filter" + assert ( + filtered_atom == filtered_scenario + ), f"Expected {atom.res_label} to return {filtered_atom} after filter" def test_add_gaussian_at_position(): @@ -308,16 +546,18 @@ def test_add_gaussian_at_position(): modifiers_triple = (0, 0, 0) codec = cfds.Codec.CNOCBCA() - secondary_matrix, atom_idx = codec.encode_gaussian_atom( - "C", modifiers_triple - ) + secondary_matrix, atom_idx = codec.encode_gaussian_atom("C", modifiers_triple) atom_coord = (1, 1, 1) - added_matrix = cfds.add_gaussian_at_position(main_matrix, secondary_matrix[:,:,:, atom_idx], atom_coord, atom_idx) + added_matrix = cfds.add_gaussian_at_position( + main_matrix, secondary_matrix[:, :, :, atom_idx], atom_coord, atom_idx + ) # Check general sum: np.testing.assert_array_almost_equal(np.sum(added_matrix), 1.0, decimal=2) # Check center: - assert (0 < added_matrix[1, 1, 1][0] < 1), f"The central atom should be 1 but got {main_matrix[1, 1, 1, 0]}." + assert ( + 0 < added_matrix[1, 1, 1][0] < 1 + ), f"The central atom should be 1 but got {main_matrix[1, 1, 1, 0]}." # Check middle points (in each direction so 6 total points): # +---+---+---+ # | _ | X | _ | @@ -325,43 +565,155 @@ def test_add_gaussian_at_position(): # | _ | X | _ | # +---+---+---+ # Where 0 is the central atom - np.testing.assert_array_almost_equal(added_matrix[1, 0, 1, 0], added_matrix[0, 1, 1, 0], decimal=2, err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[1, 0, 1, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[1, 1, 0, 0], added_matrix[0, 1, 1, 0], decimal=2, err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[1, 1, 0, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[1, 1, 2, 0], added_matrix[0, 1, 1, 0], decimal=2, err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[1, 1, 2, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[1, 2, 1, 0], added_matrix[0, 1, 1, 0], decimal=2, err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[1, 2, 1, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[2, 1, 1, 0], added_matrix[0, 1, 1, 0], decimal=2, err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[2, 1, 1, 0]}.") + np.testing.assert_array_almost_equal( + added_matrix[1, 0, 1, 0], + added_matrix[0, 1, 1, 0], + decimal=2, + err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[1, 0, 1, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[1, 1, 0, 0], + added_matrix[0, 1, 1, 0], + decimal=2, + err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[1, 1, 0, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[1, 1, 2, 0], + added_matrix[0, 1, 1, 0], + decimal=2, + err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[1, 1, 2, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[1, 2, 1, 0], + added_matrix[0, 1, 1, 0], + decimal=2, + err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[1, 2, 1, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[2, 1, 1, 0], + added_matrix[0, 1, 1, 0], + decimal=2, + err_msg=f"The atom should be {added_matrix[0, 1, 1, 0]} but got {main_matrix[2, 1, 1, 0]}.", + ) # Check inner corners (in each direction so 12 total points): # +---+---+---+ # | X | _ | X | # | _ | 0 | _ | # | X | _ | X | # +---+---+---+ - np.testing.assert_array_almost_equal(added_matrix[0, 1, 0, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[0, 1, 0, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[0, 1, 2, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[0, 1, 2, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[0, 2, 1, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[0, 2, 1, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[1, 0, 0, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[1, 0, 0, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[1, 0, 2, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[1, 0, 2, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[1, 2, 0, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[1, 2, 0, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[1, 2, 2, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[1, 2, 2, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[2, 0, 1, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[2, 0, 1, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[2, 1, 0, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[2, 1, 0, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[2, 1, 2, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[2, 1, 2, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[2, 2, 1, 0], added_matrix[0, 0, 1, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[2, 2, 1, 0]}.") + np.testing.assert_array_almost_equal( + added_matrix[0, 1, 0, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[0, 1, 0, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[0, 1, 2, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[0, 1, 2, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[0, 2, 1, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[0, 2, 1, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[1, 0, 0, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[1, 0, 0, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[1, 0, 2, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[1, 0, 2, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[1, 2, 0, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[1, 2, 0, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[1, 2, 2, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[1, 2, 2, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[2, 0, 1, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[2, 0, 1, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[2, 1, 0, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[2, 1, 0, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[2, 1, 2, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[2, 1, 2, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[2, 2, 1, 0], + added_matrix[0, 0, 1, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 1, 0]} but got {added_matrix[2, 2, 1, 0]}.", + ) # Check outer corners(in each direction so 8 total points): # +---+---+---+ # | X | _ | X | # | _ | _ | _ | # | X | _ | X | # +---+---+---+ - np.testing.assert_array_almost_equal(added_matrix[0, 2, 0, 0], added_matrix[0, 0, 2, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[0, 2, 0, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[0, 2, 2, 0], added_matrix[0, 0, 2, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[0, 2, 2, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[2, 0, 0, 0], added_matrix[0, 0, 2, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[2, 0, 0, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[2, 0, 2, 0], added_matrix[0, 0, 2, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[2, 0, 2, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[2, 2, 0, 0], added_matrix[0, 0, 2, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[2, 2, 0, 0]}.") - np.testing.assert_array_almost_equal(added_matrix[2, 2, 2, 0], added_matrix[0, 0, 2, 0], decimal=4, err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[2, 2, 2, 0]}.") + np.testing.assert_array_almost_equal( + added_matrix[0, 2, 0, 0], + added_matrix[0, 0, 2, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[0, 2, 0, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[0, 2, 2, 0], + added_matrix[0, 0, 2, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[0, 2, 2, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[2, 0, 0, 0], + added_matrix[0, 0, 2, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[2, 0, 0, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[2, 0, 2, 0], + added_matrix[0, 0, 2, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[2, 0, 2, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[2, 2, 0, 0], + added_matrix[0, 0, 2, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[2, 2, 0, 0]}.", + ) + np.testing.assert_array_almost_equal( + added_matrix[2, 2, 2, 0], + added_matrix[0, 0, 2, 0], + decimal=4, + err_msg=f"The atom should be {added_matrix[0, 0, 2, 0]} but got {added_matrix[2, 2, 2, 0]}.", + ) # Add additional point and check whether the sum is 2: atom_coord = (2, 2, 2) - added_matrix = cfds.add_gaussian_at_position(added_matrix, secondary_matrix[:,:,:, atom_idx], atom_coord, atom_idx) + added_matrix = cfds.add_gaussian_at_position( + added_matrix, secondary_matrix[:, :, :, atom_idx], atom_coord, atom_idx + ) np.testing.assert_array_almost_equal(np.sum(added_matrix), 2.0, decimal=2) # Add point in top left corner and check whether the normalization still adds up to 1: # +---+---+---+ @@ -371,39 +723,55 @@ def test_add_gaussian_at_position(): # +---+---+---+ # We are keeping all the X and 0 atom_coord = (0, 0, 0) - added_matrix = cfds.add_gaussian_at_position(main_matrix, secondary_matrix[:,:,:, atom_idx], atom_coord, atom_idx) + added_matrix = cfds.add_gaussian_at_position( + main_matrix, secondary_matrix[:, :, :, atom_idx], atom_coord, atom_idx + ) np.testing.assert_array_almost_equal(np.sum(added_matrix), 3.0, decimal=2) np.testing.assert_array_less(added_matrix[0, 0, 0][0], 1) - assert (0 < added_matrix[0, 0, 0][0] <= 1), f"The central atom value should be between 0 and 1 but was {added_matrix[0, 0, 0][0]}" + assert ( + 0 < added_matrix[0, 0, 0][0] <= 1 + ), f"The central atom value should be between 0 and 1 but was {added_matrix[0, 0, 0][0]}" # Testing N, O, Ca, Cb atom channels. Adding atoms at (0, 0, 0) in different channels: - N_secondary_matrix, N_atom_idx = codec.encode_gaussian_atom( - "N", modifiers_triple + N_secondary_matrix, N_atom_idx = codec.encode_gaussian_atom("N", modifiers_triple) + added_matrix = cfds.add_gaussian_at_position( + main_matrix, N_secondary_matrix[:, :, :, N_atom_idx], atom_coord, N_atom_idx ) - added_matrix = cfds.add_gaussian_at_position(main_matrix, N_secondary_matrix[:,:,:, N_atom_idx], atom_coord, N_atom_idx) np.testing.assert_array_almost_equal(np.sum(added_matrix), 4.0, decimal=2) np.testing.assert_array_less(added_matrix[0, 0, 0][N_atom_idx], 1) - assert (0 < added_matrix[0, 0, 0][N_atom_idx] <= 1), f"The central atom value should be between 0 and 1 but was {added_matrix[0, 0, 0][N_atom_idx]}" - O_secondary_matrix, O_atom_idx = codec.encode_gaussian_atom( - "O", modifiers_triple + assert ( + 0 < added_matrix[0, 0, 0][N_atom_idx] <= 1 + ), f"The central atom value should be between 0 and 1 but was {added_matrix[0, 0, 0][N_atom_idx]}" + O_secondary_matrix, O_atom_idx = codec.encode_gaussian_atom("O", modifiers_triple) + added_matrix = cfds.add_gaussian_at_position( + main_matrix, O_secondary_matrix[:, :, :, O_atom_idx], atom_coord, O_atom_idx ) - added_matrix = cfds.add_gaussian_at_position(main_matrix, O_secondary_matrix[:,:,:, O_atom_idx], atom_coord, O_atom_idx) np.testing.assert_array_almost_equal(np.sum(added_matrix), 5.0, decimal=2) np.testing.assert_array_less(added_matrix[0, 0, 0][O_atom_idx], 1) - assert (0 < added_matrix[0, 0, 0][O_atom_idx] <= 1), f"The central atom value should be between 0 and 1 but was {added_matrix[0, 0, 0][O_atom_idx]}" + assert ( + 0 < added_matrix[0, 0, 0][O_atom_idx] <= 1 + ), f"The central atom value should be between 0 and 1 but was {added_matrix[0, 0, 0][O_atom_idx]}" CA_secondary_matrix, CA_atom_idx = codec.encode_gaussian_atom( "CA", modifiers_triple ) - added_matrix = cfds.add_gaussian_at_position(main_matrix, CA_secondary_matrix[:,:,:, CA_atom_idx], atom_coord, CA_atom_idx) + added_matrix = cfds.add_gaussian_at_position( + main_matrix, CA_secondary_matrix[:, :, :, CA_atom_idx], atom_coord, CA_atom_idx + ) np.testing.assert_array_almost_equal(np.sum(added_matrix), 6.0, decimal=2) np.testing.assert_array_less(added_matrix[0, 0, 0][CA_atom_idx], 1) - assert (0 < added_matrix[0, 0, 0][CA_atom_idx] <= 1), f"The central atom value should be between 0 and 1 but was {added_matrix[0, 0, 0][CA_atom_idx]}" + assert ( + 0 < added_matrix[0, 0, 0][CA_atom_idx] <= 1 + ), f"The central atom value should be between 0 and 1 but was {added_matrix[0, 0, 0][CA_atom_idx]}" CB_secondary_matrix, CB_atom_idx = codec.encode_gaussian_atom( "CB", modifiers_triple ) - added_matrix = cfds.add_gaussian_at_position(main_matrix, CB_secondary_matrix[:,:,:, CB_atom_idx], atom_coord, CB_atom_idx) + added_matrix = cfds.add_gaussian_at_position( + main_matrix, CB_secondary_matrix[:, :, :, CB_atom_idx], atom_coord, CB_atom_idx + ) np.testing.assert_array_almost_equal(np.sum(added_matrix), 7.0, decimal=2) np.testing.assert_array_less(added_matrix[0, 0, 0][CB_atom_idx], 1) - assert (0 < added_matrix[0, 0, 0][CB_atom_idx] <= 1), f"The central atom value should be between 0 and 1 but was {CB_atom_idx[0, 0, 0][CA_atom_idx]}" + assert ( + 0 < added_matrix[0, 0, 0][CB_atom_idx] <= 1 + ), f"The central atom value should be between 0 and 1 but was {CB_atom_idx[0, 0, 0][CA_atom_idx]}" def test_download_pdb_from_csv_file(): @@ -445,15 +813,15 @@ def test_download_pdb_from_csv_file(): voxelise_all_states=True, ) assert ( - TEST_DATA_DIR / "1qys.pdb" + TEST_DATA_DIR / "1qys.pdb" ).exists(), f"Expected download of 1QYS to return PDB file" assert ( - TEST_DATA_DIR / "3qy1A.pdb" + TEST_DATA_DIR / "3qy1A.pdb" ).exists(), f"Expected download of 3QYA to return PDB file" (TEST_DATA_DIR / "1qys.pdb").unlink(), (TEST_DATA_DIR / "3qy1A.pdb").unlink() for i in range(0, 10): - pdb_code = f'6ct4_{i}.pdb' + pdb_code = f"6ct4_{i}.pdb" new_paths = TEST_DATA_DIR / pdb_code assert new_paths.exists(), f"Could not find path {new_paths} for {pdb_code}" new_paths.unlink()