Source code for schrodinger.rdkit.sorted_molwriter
"""
This module contains the SortedMolWriter class, which writes a sorted,
capped list of molecules to a SMILES-CSV file. If capping at <n>
molecules, SortedMolWriter holds a maximum of <n> rows of data in memory,
where a row consists of the SMILES string, molecule name and a dictionary
of properties.
Copyright Schrodinger LLC, All Rights Reserved.
"""
import csv
import heapq
from rdkit import Chem
from schrodinger.infra import mm
from schrodinger.rdkit import rdkit_adapter
from schrodinger.utils import fileutils
class SortedMolWriter:
    """
    Collects molecules and writes a sorted, capped list of them to a
    SMILES-CSV file.

    Rows are accumulated in a heap holding at most `max_mol` entries (when
    `max_mol` is non-negative). Nothing is written until the context
    manager exits, at which point the retained rows are sorted, written,
    and the output file is closed::

        with SortedMolWriter('out.csv', sort_prop, 100) as writer:
            for mol in mols:
                writer.add(mol)
    """

    def __init__(self,
                 csv_file,
                 sort_prop,
                 max_mol,
                 sort_reverse=False,
                 user_props=None):
        """
        :param csv_file: Output SMILES-CSV file name. Will be compressed
            if csv_file ends with 'gz'.
        :type csv_file: str

        :param sort_prop: The name of the property on which molecules
            should be sorted. Must be present for all molecules.
        :type sort_prop: str

        :param max_mol: Cap molecules at this number. Take care if using
            a large value, as this determines the number of rows held
            in memory. Capping can be disabled by supplying a negative
            value, but this should be done only if one knows the total
            number of rows will fit comfortably in memory.
        :type max_mol: int

        :param sort_reverse: Whether to sort by decreasing value of
            sort_prop
        :type sort_reverse: bool

        :param user_props: Names of properties to output. Empty fields are
            written for missing properties. By default, the union of
            all properties encountered is output. The sort property is
            always included in the output.
        :type user_props: Iterable of str or NoneType
        """
        # newline='' lets the csv module control line endings itself.
        self._fh = fileutils.open_maybe_compressed(csv_file, 'wt', newline='')
        self.sort_prop = sort_prop
        self.max_mol = max_mol

        # heapq.heappushpop always keeps highest values, so negate if
        # increasing sort.
        self._heap_sign = 1 if sort_reverse else -1

        # Names of all property columns to be written; the sort property
        # is always among them.
        self._props_union = {sort_prop}

        # An empty list means "output every property encountered".
        self._user_props = []
        if user_props:
            # NOTE(review): the title property is skipped, presumably
            # because the NAME column already carries it -- confirm.
            self._props_union.update(
                prop for prop in user_props
                if prop != mm.M2IO_DATA_CT_TITLE)
            self._user_props = list(self._props_union)

        self._row_heap = []
        self._mol_counter = 0

    def add(self, mol):
        """
        Adds a molecule to the capped list.

        The sort property is looked up directly in the digested
        properties, so a molecule lacking it fails here.

        :param mol: The molecule to add
        :type mol: rdkit.Chem.rdchem.Mol
        """
        smiles = Chem.MolToSmiles(mol)
        mol_name = mol.GetProp('_Name')
        props_dict = self._digestProps(mol)
        sort_value = self._heap_sign * props_dict[self.sort_prop]

        # The negated running counter makes every row tuple unique, which
        # gives a stable sort (earlier molecules win ties) and guarantees
        # that tuple comparison never reaches the properties dictionary.
        self._mol_counter += 1
        row = (sort_value, -self._mol_counter, smiles, mol_name, props_dict)

        if len(self._row_heap) == self.max_mol:
            # Heap is full: insert the new row and drop the lowest one.
            heapq.heappushpop(self._row_heap, row)
        else:
            heapq.heappush(self._row_heap, row)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        """
        Writes the retained rows to the CSV file and closes it. The file
        is closed even if writing raises.
        """
        try:
            self._writeCsvFile()
        finally:
            self._fh.close()

    def _digestProps(self, mol):
        """
        If outputting all properties, this function updates the union
        of all property names and returns a dictionary of all properties.
        If outputting user properties, this function simply returns a
        dictionary of the user properties that are present on the molecule.

        :param mol: Molecule whose properties are to be digested
        :type mol: rdkit.Chem.rdchem.Mol

        :return: Name:Value dictionary of properties from mol
        :rtype: dict
        """
        all_props_dict = rdkit_adapter.translate_rdkit_props_dict(
            mol.GetPropsAsDict())
        if self._user_props:
            # Missing user properties are simply left out; the CSV writer
            # emits empty fields for them.
            return {
                prop: all_props_dict[prop]
                for prop in self._user_props
                if prop in all_props_dict
            }
        self._props_union.update(all_props_dict.keys())
        return all_props_dict

    def _writeCsvFile(self):
        """
        Writes the header and all retained rows, in sorted order, to the
        output file. The stored property dictionaries are not modified.
        """
        fields = ['SMILES', 'NAME'] + sorted(self._props_union)
        writer = csv.DictWriter(self._fh, fieldnames=fields)
        writer.writeheader()
        # self._heap_sign ensures that reverse sort is always needed here.
        for _, _, smiles, mol_name, props_dict in sorted(self._row_heap,
                                                         reverse=True):
            # Build a fresh row so the heap's dictionaries stay untouched.
            writer.writerow({
                **props_dict,
                'SMILES': smiles,
                'NAME': mol_name,
            })