Source code for schrodinger.test.stu.get_pdbs

import ftplib
import json
import math
import os
import random

import requests.exceptions

import schrodinger
from schrodinger.protein import getpdb

# File with the list of all PDB IDs (stored as JSON).  Created if it does not
# already exist.  The path is relative, so it is resolved against the current
# working directory.  It is only read and written when
# schrodinger.in_dev_env() is true.
CACHE_FILE = 'pdb_id.json'

# PDB IDs that sample_pdbs() never yields.  IDs are upper-cased before being
# looked up here, so every entry must be written in upper case.
#
# These are very bad structures, with lots of errors. They are also very large,
# and so it takes time to figure out what went wrong. We'll skip these
# structures for now so that we can see any new problems. See SHARED-3507 for
# discussion.
PDBIDS_TO_SKIP = [
    '4BTS', '4L47', '4L71', '4LEL', '4LFZ', '4LNT', '4LSK', '4LT8', '4QYK',
    '4TVX', '4U1U', '4U1V', '4U20', '4U24', '4U25', '4U26', '4U27', '4UBV',
    '4V42', '4V4B', '4V4H', '4V4I', '4V4J', '4V4M', '4V4O', '4V4P', '4V4Q',
    '4V4R', '4V4S', '4V4T', '4V4X', '4V4Y', '4V4Z', '4V50', '4V52', '4V53',
    '4V54', '4V55', '4V56', '4V57', '4V5B', '4V5F', '4V5G', '4V5I', '4V5L',
    '4V5O', '4V5P', '4V5Q', '4V5R', '4V5S', '4V5Y', '4V64', '4V68', '4V6B',
    '4V6C', '4V6D', '4V6E', '4V6K', '4V6T', '4V6Y', '4V6Z', '4V70', '4V71',
    '4V72', '4V73', '4V74', '4V75', '4V76', '4V77', '4V78', '4V79', '4V7A',
    '4V7B', '4V7C', '4V7D', '4V7G', '4V7H', '4V7N', '4V7P', '4V7S', '4V7T',
    '4V7U', '4V7V', '4V80', '4V81', '4V85', '4V89', '4V8A', '4V8N', '4V8O',
    '4V8Q', '4V8R', '4V8S', '4V8U', '4V90', '4V91', '4V93', '4V94', '4V98',
    '4V9C', '4V9D', '4V9F', '4V9H', '4V9I', '4V9J', '4V9K', '4V9L', '4V9M',
    '4W29', '4WF1', '4WZJ'
]

# This structure is known to be invalid. See analysis in CONV-908
PDBIDS_TO_SKIP.append('1VVJ')


def write_pdb_cache(pdb_ids):
    """
    Serialize the PDB IDs to `CACHE_FILE` as JSON.

    Nothing is written unless running in the dev environment.

    :param pdb_ids: JSON-serializable collection of PDB ids to store.
    """
    if not schrodinger.in_dev_env():
        return
    with open(CACHE_FILE, 'w') as cache:
        json.dump(pdb_ids, cache)
def _download_pdb_list():
    """
    Download the list of PDB ids from the wwPDB FTP server.

    The ids are parsed from the ``resolu.idx`` index file, sorted, and passed
    to `write_pdb_cache` (in the dev environment, the list is serialized).

    :return: sorted list of PDB ids
    :raises RuntimeError: if no PDB ids could be parsed from the index file.
    """
    # See list of potential indices here:
    # https://www.rcsb.org/pages/general/summaries
    server = 'ftp.wwpdb.org'
    pdb_index_file = '/pub/pdb/derived_data/index/resolu.idx'
    pdb_ids = []

    def store(line):
        # Data lines look like "100D\t;\t1.9": a 4-character id, a separator
        # and then ';' at index 5.  Any line not matching this layout is
        # ignored.
        if len(line) > 5 and line[5] == ';':
            pdb_ids.append(line[:4])

    # The context manager releases the connection even when login or the
    # transfer raises, so a failed download does not leak the socket.
    with ftplib.FTP(server) as ftp:
        ftp.login()
        # This is slow (2-10s):
        ftp.retrlines(f'RETR {pdb_index_file}', store)

    pdb_ids.sort()
    if not pdb_ids:
        # in case format changes or something.
        raise RuntimeError(f'No PDB IDs found at {server}{pdb_index_file}')
    write_pdb_cache(pdb_ids)
    return pdb_ids
def get_pdb_list():
    """
    Get all PDB IDs either from a serialized file or the PDB server.

    In the dev environment `CACHE_FILE` is tried first. If it is missing,
    unreadable or does not contain valid JSON, the list is downloaded again.

    :return: list of PDB ids
    """
    if schrodinger.in_dev_env():
        try:
            with open(CACHE_FILE) as fh:
                return json.load(fh)
        except (OSError, ValueError):
            # OSError: the cache does not exist yet or cannot be read.
            # ValueError: covers json.JSONDecodeError, e.g. a cache file that
            # was truncated by an interrupted write.  Either way, fall
            # through to a fresh download instead of failing on every call.
            pass
    return _download_pdb_list()
def get_pdb(pdb_id):
    """
    Return the structure file for a PDB id, downloading it only when
    `getpdb.retrieve_pdb` does not provide one.

    :param pdb_id: PDB id of the structure to fetch.

    :return: Filename of PDB structure file.
    """
    retrieved = getpdb.retrieve_pdb(pdb_id)
    if retrieved is not None:
        return retrieved
    # retrieve_pdb gave us nothing: fall back to downloading.
    return getpdb.download_pdb(pdb_id)
def sample_pdbs(fraction=1,
                number=None,
                min_value=0,
                max_value=100,
                allow_download_failure=True):
    """
    Iterate over a sample of all available structures in the PDB. For each
    one, fetch the structure and yield the filename of the structure file.

    The sample is chosen first (``number`` takes precedence over
    ``fraction``) and is then narrowed to the ``min_value``..``max_value``
    percent window of that sample. IDs listed in `PDBIDS_TO_SKIP` are never
    yielded.

    :param fraction: Fraction of all PDBs that should be sampled at random.
        Ignored when ``number`` is given. With the default of 1 every ID is
        used, in list order, without random sampling.
    :param number: Number of PDB ids to sample at random, or None to use
        ``fraction`` instead.
    :param min_value: Start of the window as a percentage of the sample; the
        start index is rounded down.
    :param max_value: End of the window as a percentage of the sample; the
        end index is rounded up.
    :param allow_download_failure: If True, a retrieval failure is reported
        and the structure is skipped; if False, the exception is re-raised.
        Failures whose message contains '404' are always reported and
        skipped.

    :raises ValueError: from `random.sample` if the requested sample size is
        negative or larger than the number of available ids.

    @yield: filenames of PDB structure files.
    """
    all_pdbs = get_pdb_list()
    if number is None and fraction == 1:
        sample = all_pdbs
    else:
        if number is not None:
            count = number
        else:
            count = int(fraction * len(all_pdbs))
        sample = random.sample(all_pdbs, count)
        print('%i PDBs found, %i sampled' % (len(all_pdbs), len(sample)))

    # Percent window over the sample: floor for the start and ceil for the
    # end, so adjacent windows never leave a gap between them.
    slice_start = math.floor((min_value / 100.0) * len(sample))
    slice_end = math.ceil((max_value / 100.0) * len(sample))
    sample = sample[slice_start:slice_end]
    # Drop our reference to the full ID list; only the slice is needed while
    # the (potentially long-running) iteration below is in progress.
    del all_pdbs

    # Build the lookup once instead of scanning the list for every ID.
    skipped = frozenset(PDBIDS_TO_SKIP)
    for pdb_id in sample:
        if pdb_id.upper() in skipped:
            continue
        # Only the retrieval is guarded.  The yield stays outside the try so
        # that an exception thrown into the generator by the consumer is not
        # mistaken for a download failure and swallowed.
        try:
            # NOTE: This will download the file as *.pdb or *.cif:
            filename = get_pdb(pdb_id)
        except (RuntimeError, requests.exceptions.HTTPError) as err:
            if '404' in str(err):
                print(f'{pdb_id} is missing from the PDB web server.')
            elif allow_download_failure:
                print('missed connection to RCSB')
            else:
                raise
            continue
        yield filename
def require_local_pdb():
    """
    Verify that getpdb can serve structures from a local PDB mirror, so that
    the pdb server is not hit.

    A known entry ('2DAN') is retrieved as a probe; when a non-empty file
    comes back it is removed again and the check passes.

    :raises RuntimeError: if the probe structure could not be retrieved as a
        non-empty file.
    """
    # Attempt to retrieve a PDB file from the local mirror
    probe = getpdb.retrieve_pdb('2DAN')
    try:
        if probe and os.path.getsize(probe) > 0:
            os.unlink(probe)
            return
    except FileNotFoundError:
        # The reported file is not on disk: treat as "mirror unavailable".
        pass
    raise RuntimeError(
        'Local PDB repository not available. Install the PDB in '
        '$SCHRODINGER/thirdparty/database or set the SCHRODINGER_PDB '
        'environment variable (probably to '
        '/builds/thirdparty/current/database/pdb)')