Source code for schrodinger.protein.gpcr.update_gpcr_db

"""
This script downloads sequence and residue data from the GPCR DB and stores it
in a sqlite database.

Copyright Schrodinger, LLC. All rights reserved.
"""

import contextlib
import itertools
import json
from typing import Iterable
import uuid

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from schrodinger.utils import fileutils
from schrodinger.utils import subprocess

from . import gpcrdb
from . import sql


def create_entry_database():
    """
    Create a sqlite database of sequences and residues from the GPCR DB.

    The sqlite database is created using a random filename (relative to the
    current working directory) and must be moved to the appropriate location
    for use.

    Rows are committed periodically while downloading, and once more after
    the last row so that the trailing partial batch is not lost when the
    connection is closed.

    :return: Path to sqlite database
    :rtype: str
    """
    db_filename = f"{uuid.uuid4()}.sqlite"
    conn = _init_database(db_filename)
    # Everything that can fail happens inside the `closing` block so that the
    # connection is always released.
    with contextlib.closing(conn):
        cur = conn.cursor()
        row_gen = gpcrdb.download_all_entry_data()
        # Maps residue row data -> primary key, shared across all entries so
        # identical residues are only stored once.
        all_residues = dict()
        next_residue_pk, next_entry_pk = 1, 1
        for i, row in enumerate(row_gen):
            next_residue_pk, next_entry_pk = _insert_row(
                cur,
                row,
                all_residues=all_residues,
                next_residue_pk=next_residue_pk,
                next_entry_pk=next_entry_pk)
            # Commit in batches to bound the size of the open transaction
            if i % 100 == 0:
                conn.commit()
        # Closing a sqlite3 connection does not commit pending changes, so
        # flush whatever was inserted since the last periodic commit.
        conn.commit()
    return db_filename
def _insert_row(cur,
                row_data,
                all_residues,
                next_residue_pk=1,
                next_entry_pk=1):
    """
    Insert data for a single GPCR DB entry.

    Residues that have already been stored (for this or a previous entry) are
    not inserted again; the entry is simply linked to the existing residue
    row. `all_residues` is updated in place with any newly inserted residues.

    :param cur: Database cursor
    :type cur: sqlite3.Cursor

    :param row_data: Entry data, unpacked as (entry name, residue numbering
        scheme, sequence, families, residues)
    :type row_data: tuple

    :param all_residues: Dict of residue primary keys, keyed by the row data.
        Used to reduce duplicate data in the database to save space.
    :type all_residues: dict

    :param next_residue_pk: Next primary key to use in the residue table
    :type next_residue_pk: int

    :param next_entry_pk: Primary key to use for this entry in the entry table
    :type next_entry_pk: int

    :return: The next unused residue primary key and the next unused entry
        primary key
    :rtype: tuple(int, int)
    """
    entry_name, res_number_scheme, sequence, families, residues = row_data

    # Resolve every residue of this entry to a primary key, inserting a new
    # residue row only the first time a given residue is seen.
    linked_residue_pks = []
    for residue in residues:
        residue_key = tuple(residue[field] for field in sql.RESIDUES_KEYS)
        residue_pk = all_residues.get(residue_key)
        if residue_pk is None:
            residue_pk = next_residue_pk
            cur.execute(sql.INSERT_RESIDUE_SQL, [residue_pk, *residue_key])
            all_residues[residue_key] = residue_pk
            next_residue_pk += 1
        linked_residue_pks.append(residue_pk)

    # The entry itself; families are stored as a JSON string
    entry_values = [
        next_entry_pk, entry_name, res_number_scheme, sequence,
        json.dumps(families)
    ]
    cur.execute(sql.INSERT_ENTRY_SQL, entry_values)

    # One (entry pk, residue pk) link per residue of the entry
    links = ((next_entry_pk, residue_pk) for residue_pk in linked_residue_pks)
    cur.executemany(sql.INSERT_ENTRY_RESIDUES_SQL, links)

    return next_residue_pk, next_entry_pk + 1


def _init_database(filename):
    """
    Open the database, run the table creation script and commit.

    :param filename: Path of the database to open
    :type filename: str

    :return: The open connection; the caller is responsible for closing it
    :rtype: sqlite3.Connection
    """
    conn = sql.open_database(filename)
    conn.cursor().executescript(sql.CREATE_SQL)
    conn.commit()
    return conn


def _get_seqs(gpcr_db_filename: str) -> Iterable[SeqRecord]:
    """
    Lazily yield one sequence record per row of the entries table.

    The database connection is opened when iteration starts and closed when
    the generator finishes or is closed.

    :param gpcr_db_filename: Path to database with sequences from GPCR DB
    :return: Records with an id of the form ``pdb|<entry_name>|G`` and an
        empty description
    """
    with contextlib.closing(sql.open_database(gpcr_db_filename)) as db:
        rows = db.execute("SELECT entry_name, sequence FROM entries")
        for name, residues in rows:
            record_id = f"pdb|{name}|G"
            yield SeqRecord(Seq(residues), id=record_id, description="")
def create_blast_db(gpcr_db_filename: str):
    """
    Create a BLAST database using sequences from GPCR DB

    The sequences are first written to a randomly named FASTA file (relative
    to the current working directory), which is then fed to
    ``makeblastdb_29``. The command is given ``gpcrdb`` as both the output
    name and the title.

    :param gpcr_db_filename: Path to database with sequences from GPCR DB
    """
    # Dump all sequences to a temporary FASTA file
    fasta_filename = f"{uuid.uuid4()}.fasta"
    with open(fasta_filename, 'w') as fasta_file:
        SeqIO.write(_get_seqs(gpcr_db_filename), fasta_file, "fasta")

    # Build the makeblastdb invocation
    launcher = ['run', '-FROM', 'psp', 'makeblastdb_29']
    options = [
        '-in', fasta_filename,
        '-out', 'gpcrdb',
        '-dbtype', 'prot',
        '-title', 'gpcrdb',
        '-parse_seqids',
    ]  # yapf: disable
    result = subprocess.run(launcher + options)
    # Stop here on failure so the FASTA file below is left for debugging
    result.check_returncode()

    # makeblastdb succeeded, so the intermediate FASTA file can go
    fileutils.force_remove(fasta_filename)
def main():
    """
    Build the GPCR DB sqlite database, then the BLAST database derived from
    its sequences.
    """
    entry_db_path = create_entry_database()
    # TODO move database file to the appropriate location (TBD)
    create_blast_db(entry_db_path)
    # TODO move blast database to the appropriate location (TBD)


if __name__ == "__main__":
    main()