# Source code for schrodinger.application.bioluminate.propka_parse

"""
Module used to parse the output from a propka job.


"""

#- Imports -------------------------------------------------------------------

import re

#- Globals -------------------------------------------------------------------

# Compiled patterns used to locate the sections of a propka output file.
# Each pattern is applied to one line at a time (lines keep their trailing
# newline when read by iterating over the file object).
REGEXS = {
    # Section terminator: a line starting with three or more dashes, or a
    # line that is empty / whitespace only.
    'break': re.compile(r'(^---+|^\s*$)'),
    # Column-title row of the detailed table. After "BURIED" any number of
    # further column titles (word characters and blanks) may follow; the
    # trailing newline is optional so a final line without one still matches.
    # The previous form allowed exactly one character after "BURIED", which
    # cannot match a row carrying the additional columns that
    # DETAILED_HEADER describes.
    'detailed': re.compile(r'^\s*RESIDUE\s+pKa\s+BURIED[ \t\w]*\s*$'),
    # Column-title row of the summary table.
    'summary': re.compile(r'^\s*Group\s+pKa\s+model-pKa'
                          r'\s+ligand atom-type\s*$'),
    # First line of the free-energy section.
    'free_ene': re.compile(r'^Free energy of'),
    # Column-title row of the charge table.
    'charge': re.compile(r'^\s*pH\s+unfolded\s+folded\s*$'),
    # Isoelectric-point line; the two values are captured as text in the
    # named groups ``folded_pI`` and ``unfolded_pI``.
    'pI': re.compile(r'^\s*The\s+pI\s+is\s+'
                     r'(?P<folded_pI>-?\d*\.\d*)'
                     r'\s+\(folded\)\s+and\s+'
                     r'(?P<unfolded_pI>-?\d*\.\d*)'
                     r'\s+\(unfolded\)\s*$')
}

# Column names for the detailed table, in column order.
DETAILED_HEADER = [
    'resname',
    'resnum',
    'chain',
    'pKa',
    'buried',
    'desolvation regular',
    'effects re',
    'sidechain h-bond',
    'backbone h-bond',
    'coulombic interaction',
]

# Column names for the summary table, in column order. get_summary() maps
# requested header names to token positions through this list.
SUMMARY_HEADER = [
    'resname',
    'resnum',
    'chain',
    'pKa',
    'pKmodel',
    'ligand atom-type',
]

# Column names for the free-energy table (see get_free_energy()).
FREE_ENERGY_HEADER = [
    'pH',
    'free energy',
]

# Column names for the charge table (see get_charge()).
CHARGE_HEADER = [
    'pH',
    'unfolded',
    'folded',
]

#- Functions -----------------------------------------------------------------


def get_detailed(pka_file, headers=None):
    """
    Get the detailed report from the PROPKA output file.

    The documented intent is to return a list of lists, where the first list
    is the "header" and the remaining lists are the values corresponding to
    the headers.

    NOTE(review): no implementation is visible for this function. Its body
    consists of this docstring only, so a call currently returns None and
    neither argument is read. Confirm whether the parser is still to be
    written or was lost.

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type pka_file: string

    :param headers: A list of headers to return in the report. Only these
                    headers and their corresponding data are meant to be
                    returned. If this is None, all headers and values are
                    meant to be returned.

    :see: `DETAILED_HEADER`
    """
def get_summary(pka_file, headers=None):
    """
    Get the summary from the PROPKA output file.

    The file is scanned line by line. Rows are collected from the line after
    the summary column-title row (``REGEXS['summary']``) up to the next
    break line (``REGEXS['break']``: a dashed rule or a blank line).

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type pka_file: string

    :param headers: A list of headers to return in the summary. Only these
                    headers and their corresponding data are returned, in
                    the order given. If this is None (or empty), all headers
                    and values are returned.

    :return: A tuple ``(headers, table_data)``. ``headers`` is the list that
             was requested (or `SUMMARY_HEADER` itself when none was given)
             and ``table_data`` is a list of rows, each row holding the
             values for those headers. Values are the strings read from the
             file, except that a residue number split out of a fused
             name/number token is an int, and a missing "ligand atom-type"
             is None.

    :raise ValueError: If a requested header is not in `SUMMARY_HEADER`.

    :see: `SUMMARY_HEADER`
    """
    headers = headers or SUMMARY_HEADER
    indices = [SUMMARY_HEADER.index(h) for h in headers]

    # Look the patterns up once instead of on every line.
    summary_start = REGEXS['summary']
    section_break = REGEXS['break']

    table_data = []
    with open(pka_file) as lines:
        get_data = False
        for line in lines:
            if summary_start.search(line):
                get_data = True
                continue
            if section_break.search(line):
                get_data = False
                continue
            if not get_data:
                continue

            # split() without arguments already drops surrounding whitespace
            # and empty fields. Blank lines never reach this point because
            # they match the break pattern, so tokens is non-empty.
            tokens = line.split()

            # Catch 4-digit residue numbers: these arrive fused to the
            # residue name as a single token of 7 or more characters whose
            # last four characters are the number.
            first = tokens[0]
            if len(first) >= 7:
                try:
                    resnum = int(first[-4:])
                except ValueError:
                    # Not a fused name/number token. Leave the row exactly
                    # as read; removing the token here would shift every
                    # following column one place to the left.
                    pass
                else:
                    tokens[:1] = [first[:-4], resnum]

            # Make sure to add a value for "ligand atom-type" if none is
            # reported in the summary.
            if len(tokens) < 6:
                tokens.append(None)

            # Filter out only the tokens we need
            table_data.append([tokens[i] for i in indices])

    return (headers, table_data)
def get_free_energy(pka_file, headers=None):
    """
    Get the free energy of folding (kcal/mol) as a function of pH from the
    PROPKA output file.

    Rows are collected from the line after the one matching
    ``REGEXS['free_ene']`` up to the next break line (``REGEXS['break']``).

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type pka_file: string

    :param headers: A list of headers to return. Only these headers and
                    their corresponding data are returned, in the order
                    given. If this is None (or empty), all headers and
                    values are returned.

    :return: A tuple ``(headers, table_data)``: the list of headers and a
             list of rows, each row being the list of string values that
             correspond to those headers.

    :see: `FREE_ENERGY_HEADER`
    """
    if not headers:
        headers = FREE_ENERGY_HEADER
    # Position of each requested column in a whitespace-split row.
    columns = [FREE_ENERGY_HEADER.index(name) for name in headers]

    section_start = REGEXS['free_ene']
    section_end = REGEXS['break']

    rows = []
    inside_table = False
    with open(pka_file) as handle:
        for raw in handle:
            if section_start.search(raw):
                # The title line itself carries no data.
                inside_table = True
            elif section_end.search(raw):
                inside_table = False
            elif inside_table:
                fields = raw.split()
                rows.append([fields[col] for col in columns])

    return (headers, rows)
def get_charge(pka_file, headers=None):
    """
    Get the protein charge of folded and unfolded state as a function of pH
    from the PROPKA output file.

    Rows are collected from the line after the charge column-title row
    (``REGEXS['charge']``) up to the next break line (``REGEXS['break']``).
    Reading stops at the first line matching ``REGEXS['pI']``.

    :param pka_file: The name of the propka output (usually `<jobname>.pka`)
    :type pka_file: string

    :param headers: A list of headers to return. Only these headers and
                    their corresponding data are returned, in the order
                    given. If this is None (or empty), all headers and
                    values are returned.

    :return: A tuple ``(headers, table_data, pI_data)``: the list of
             headers, a list of rows of string values for those headers,
             and ``[folded_pI, unfolded_pI]`` as strings, or None when the
             file has no pI line.

    :see: `CHARGE_HEADER`
    """
    if not headers:
        headers = CHARGE_HEADER
    # Position of each requested column in a whitespace-split row.
    columns = [CHARGE_HEADER.index(name) for name in headers]

    table_start = REGEXS['charge']
    table_end = REGEXS['break']
    pi_line = REGEXS['pI']

    rows = []
    isoelectric = None
    inside_table = False
    with open(pka_file) as handle:
        for raw in handle:
            if table_start.search(raw):
                inside_table = True
                continue
            if table_end.search(raw):
                inside_table = False
                continue

            # The pI line is tested before any row parsing, whether or not
            # the table is still open, and ends the scan.
            pi_match = pi_line.search(raw)
            if pi_match is not None:
                isoelectric = [
                    pi_match.group('folded_pI'),
                    pi_match.group('unfolded_pI'),
                ]
                break

            if inside_table:
                fields = raw.split()
                rows.append([fields[col] for col in columns])

    return (headers, rows, isoelectric)
if __name__ == '__main__':
    # Command-line use: print the first four summary columns of the propka
    # output file given as the only argument.
    import sys

    # Exit with a usage hint instead of an IndexError traceback when the
    # file argument is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <jobname>.pka' % sys.argv[0])

    ifile = sys.argv[1]
    headers, summary = get_summary(ifile, SUMMARY_HEADER[:4])
    print('Headers', headers)
    print('\nSummary:\n', summary)