Source code for schrodinger.rdkit.sorted_molwriter

"""
This module contains the SortedMolWriter class, which writes a sorted,
capped list of molecules to a SMILES-CSV file. If capping at <n>
molecules, SortedMolWriter holds a maximum of <n> rows of data in memory,
where a row consists of the SMILES string, molecule name and a dictionary
of properties.

Copyright Schrodinger LLC, All Rights Reserved.
"""

import csv
import heapq

from rdkit import Chem

from schrodinger.infra import mm
from schrodinger.rdkit import rdkit_adapter
from schrodinger.utils import fileutils


class SortedMolWriter:
    """
    Collects molecules, keeps at most `max_mol` of them ranked by a chosen
    property, and writes the survivors to a SMILES-CSV file when the context
    manager exits.

    Typical use::

        with SortedMolWriter('out.csv', 'r_score', 100) as writer:
            for mol in mols:
                writer.add(mol)
    """

    def __init__(self,
                 csv_file,
                 sort_prop,
                 max_mol,
                 sort_reverse=False,
                 user_props=None):
        """
        :param csv_file: Output SMILES-CSV file name. Will be compressed if
            csv_file ends with 'gz'.
        :type csv_file: str

        :param sort_prop: The name of the property on which molecules should
            be sorted. Must be present for all molecules. Values are
            multiplied by +1/-1 internally, so they are expected to be
            numeric.
        :type sort_prop: str

        :param max_mol: Cap molecules at this number. Take care if using a
            large value, as this determines the number of rows held in
            memory. Capping can be disabled by supplying a negative value,
            but this should be done only if one knows the total number of
            rows will fit comfortably in memory.
        :type max_mol: int

        :param sort_reverse: Whether to sort by decreasing value of sort_prop
        :type sort_reverse: bool

        :param user_props: Names of properties to output. Empty fields are
            written for missing properties. By default, the union of all
            properties encountered are output.
        :type user_props: Iterable of str or NoneType
        """
        self._fh = fileutils.open_maybe_compressed(csv_file, 'wt', newline='')
        self.sort_prop = sort_prop
        self.max_mol = max_mol
        # heapq.heappushpop always keeps highest values, so negate if
        # increasing sort.
        self._heap_sign = 1 if sort_reverse else -1
        # Column names to write; the sort property is always included.
        self._props_union = {sort_prop}
        # Stays empty when every encountered property should be output.
        self._user_props = []
        if user_props:
            # The title property is skipped here.
            # NOTE(review): presumably because the molecule name is already
            # written to the NAME column - confirm.
            self._props_union.update(
                prop for prop in user_props if prop != mm.M2IO_DATA_CT_TITLE)
            self._user_props = list(self._props_union)
        self._row_heap = []
        self._mol_counter = 0

    def add(self, mol):
        """
        Adds a molecule to the capped list.

        :param mol: The molecule to add
        :type mol: rdkit.Chem.rdchem.Mol

        :raise KeyError: If the sort property is not present on the molecule.
        """
        smiles = Chem.MolToSmiles(mol)
        mol_name = mol.GetProp('_Name')
        props_dict = self._digestProps(mol)
        try:
            raw_value = props_dict[self.sort_prop]
        except KeyError:
            raise KeyError("Sort property '{}' is missing from molecule "
                           "'{}'".format(self.sort_prop, mol_name)) from None
        sort_value = self._heap_sign * raw_value
        self._mol_counter += 1
        # For stable sort. The counter is unique per row, so tuple comparison
        # never reaches the SMILES, name or (unorderable) properties dict.
        row = (sort_value, -self._mol_counter, smiles, mol_name, props_dict)
        if len(self._row_heap) == self.max_mol:
            # At capacity: push the new row and discard the smallest one.
            heapq.heappushpop(self._row_heap, row)
        else:
            heapq.heappush(self._row_heap, row)

    def __enter__(self):
        """
        :return: This writer, for use as the target of a `with` statement.
        :rtype: SortedMolWriter
        """
        return self

    def __exit__(self, *args):
        """
        Writes the retained rows to the CSV file and closes it. The file
        handle is closed even if writing fails; exceptions are not
        suppressed.
        """
        try:
            self._writeCsvFile()
        finally:
            self._fh.close()

    def _digestProps(self, mol):
        """
        If outputting all properties, this function updates the union of
        all property names and returns a dictionary of all properties.
        If outputting user properties, this function simply returns a
        dictionary of the user properties.

        :param mol: Molecule whose properties are to be digested
        :type mol: rdkit.Chem.rdchem.Mol

        :return: Name:Value dictionary of properties from mol
        :rtype: dict
        """
        all_props_dict = rdkit_adapter.translate_rdkit_props_dict(
            mol.GetPropsAsDict())
        if not self._user_props:
            self._props_union.update(all_props_dict.keys())
            return all_props_dict
        # Requested properties the molecule lacks are left out; they end up
        # as empty fields in the CSV.
        return {
            prop: all_props_dict[prop]
            for prop in self._user_props
            if prop in all_props_dict
        }

    def _writeCsvFile(self):
        """
        Writes the header and all retained rows, in final sorted order, to
        the open file handle. Columns are SMILES, NAME, then the property
        names in alphabetical order.
        """
        fields = ['SMILES', 'NAME'] + sorted(self._props_union)
        writer = csv.DictWriter(self._fh, fieldnames=fields)
        writer.writeheader()
        # self._heap_sign ensures that reverse sort is always needed here.
        for _, _, smiles, mol_name, props_dict in sorted(self._row_heap,
                                                         reverse=True):
            # Build a fresh dict so the stored properties are not mutated.
            out_row = dict(props_dict)
            out_row['SMILES'] = smiles
            out_row['NAME'] = mol_name
            writer.writerow(out_row)