# Source code for schrodinger.rdkit.sorted_molwriter

"""
This module contains the SortedMolWriter class, which writes a sorted,
capped list of molecules to a SMILES-CSV file. If capping at <n>
molecules, SortedMolWriter holds a maximum of <n> rows of data in memory,
where a row consists of the SMILES string, molecule name and a dictionary
of properties.

Copyright Schrodinger LLC, All Rights Reserved.
"""

import csv
import heapq

from rdkit import Chem

from schrodinger.infra import mm
from schrodinger.rdkit import rdkit_adapter
from schrodinger.utils import fileutils


class SortedMolWriter:
    """
    Collects molecules in a capped heap and writes them, sorted on a chosen
    property, to a SMILES-CSV file.

    Intended for use as a context manager: rows are accumulated through
    `add`, and the CSV file is written and closed when the `with` block
    exits (including when the block exits because of an exception).
    """

    def __init__(self,
                 csv_file,
                 sort_prop,
                 max_mol,
                 sort_reverse=False,
                 user_props=None):
        """
        :param csv_file: Output SMILES-CSV file name. Will be compressed
                if csv_file ends with 'gz'.
        :type csv_file: str

        :param sort_prop: The name of the property on which molecules
                should be sorted. Must be present for all molecules.
        :type sort_prop: str

        :param max_mol: Cap molecules at this number. Take care if using
                a large value, as this determines the number of rows held
                in memory. Capping can be disabled by supplying a negative
                value, but this should be done only if one knows the total
                number of rows will fit comfortably in memory.
        :type max_mol: int

        :param sort_reverse: Whether to sort by decreasing value of
                sort_prop
        :type sort_reverse: bool

        :param user_props: Names of properties to output. Empty fields are
                written for missing properties. By default, the union of
                all properties encountered are output.
        :type user_props: Iterable of str or NoneType
        """

        self.sort_prop = sort_prop
        self.max_mol = max_mol
        # heapq.heappushpop always keeps highest values, so negate if
        # increasing sort.
        self._heap_sign = 1 if sort_reverse else -1
        # The sort property is always output, whatever the user asked for.
        self._props_union = {sort_prop}
        self._user_props = []
        if user_props:
            # The title is skipped here because the molecule name is
            # already written in the dedicated NAME column.
            self._props_union.update(
                prop for prop in user_props if prop != mm.M2IO_DATA_CT_TITLE)
            self._user_props = list(self._props_union)
        self._row_heap = []
        self._mol_counter = 0
        # Open the output last so that a failure while processing the
        # arguments above cannot leave an open file handle behind.
        self._fh = fileutils.open_maybe_compressed(csv_file, 'wt', newline='')

    def add(self, mol):
        """
        Adds a molecule to the capped list.

        :param mol: The molecule to add
        :type mol: rdkit.Chem.rdchem.Mol

        :raises KeyError: If sort_prop is not among the digested properties
                of mol.
        """

        smiles = Chem.MolToSmiles(mol)
        mol_name = mol.GetProp('_Name')
        props_dict = self._digestProps(mol)
        sort_value = self._heap_sign * props_dict[self.sort_prop]
        # The negated running counter breaks ties between equal sort values
        # so that earlier molecules win (stable sort). Because it is unique,
        # tuple comparison never reaches the SMILES/name/dict elements.
        self._mol_counter += 1
        row = (sort_value, -self._mol_counter, smiles, mol_name, props_dict)
        if len(self._row_heap) == self.max_mol:
            # Heap is full: push the new row and discard the lowest-ranked.
            heapq.heappushpop(self._row_heap, row)
        else:
            # A negative max_mol never equals the heap size, so capping is
            # effectively disabled and every row is retained.
            heapq.heappush(self._row_heap, row)

    def __enter__(self):
        """
        :return: This writer, for use as the target of a `with` statement.
        :rtype: SortedMolWriter
        """
        return self

    def __exit__(self, *args):
        """
        Writes the accumulated rows to the CSV file and closes it. The file
        handle is closed even if writing fails. Exceptions are not
        suppressed.
        """
        try:
            self._writeCsvFile()
        finally:
            self._fh.close()

    def _digestProps(self, mol):
        """
        If outputting all properties, this function updates the union
        of all property names and returns a dictionary of all properties.
        If outputting user properties, this function simply returns a
        dictionary of the user properties.

        :param mol: Molecule whose properties are to be digested
        :type mol: rdkit.Chem.rdchem.Mol

        :return: Name:Value dictionary of properties from mol
        :rtype: dict
        """

        all_props_dict = rdkit_adapter.translate_rdkit_props_dict(
            mol.GetPropsAsDict())

        if self._user_props:
            # Requested properties that the molecule lacks are simply left
            # out; the CSV writer emits an empty field for them.
            return {
                prop: all_props_dict[prop]
                for prop in self._user_props
                if prop in all_props_dict
            }

        self._props_union.update(all_props_dict.keys())
        return all_props_dict

    def _writeCsvFile(self):
        """
        Writes the header and all retained rows, in sorted order, to the
        output file handle. The stored property dictionaries are not
        modified.
        """
        fields = ['SMILES', 'NAME'] + sorted(self._props_union)
        writer = csv.DictWriter(self._fh, fieldnames=fields)
        writer.writeheader()
        # self._heap_sign ensures that reverse sort is always needed here.
        for _, _, smiles, mol_name, props_dict in sorted(self._row_heap,
                                                         reverse=True):
            # Build a fresh dict per row rather than injecting the SMILES
            # and NAME keys into the stored property dictionary.
            csv_row = dict(props_dict)
            csv_row['SMILES'] = smiles
            csv_row['NAME'] = mol_name
            writer.writerow(csv_row)