# Source code for schrodinger.application.matsci.unique_species

"""
Module to calculate unique species from structure object

Copyright Schrodinger, LLC. All rights reserved.
"""

from collections import defaultdict

from schrodinger.application.matsci import msutils
from schrodinger.application.matsci import rdpattern
from schrodinger.structutils import analyze

# Property set on each extracted molecule structure to record the number that
# molecule had in the structure it was extracted from
MOLECULE_NUM = 'i_matsci_molecule_number'


class UniqueMolecules:
    """
    Class to calculate unique molecules in a structure.
    """

    def __init__(self, struct):
        """
        Constructs a new instance of UniqueMolecules

        :param struct: The structure
        :type struct: `structure.Structure`
        """
        # Keep a private copy of the input structure; all later lookups
        # (molecule iteration, atom extraction) go through this copy.
        self.struct = struct.copy()
        self.unique_mol_nums = self.getUniqueMols()

    def _getAllMolSts(self):
        """
        Extracts all the molecules from the structure and sets the
        MOLECULE_NUM property on each of them to map them back to the
        original structure

        :returns: All extracted molecules from structure
        :rtype: list(`structure.Structure`)
        """
        mol_sts = []
        for mol in self.struct.molecule:
            mol_st = mol.extractStructure()
            # Remember which molecule of self.struct this structure came from
            mol_st.property[MOLECULE_NUM] = mol.number
            mol_sts.append(mol_st)
        return mol_sts

    def _splitUniqueValue(self, prop_values):
        """
        Flattens all the items in the values of the input dict into two lists.
        Each value of the input dictionary should be a list. Items from lists
        of length 1 are returned in the first list, items from lists of
        length > 1 are returned in the second list.

        :param dict prop_values: values are lists of items

        :rtype: (list, list)
        :returns: The first list contains all the items that were in 1-item
            lists. The second list contains all the items that were in lists
            > 1 item long.
        """
        unique_vals, non_unique_vals = [], []
        for values in prop_values.values():
            if len(values) == 1:
                unique_vals.append(values[0])
            else:
                non_unique_vals.extend(values)
        return unique_vals, non_unique_vals

    def splitUniqueMolsUsingNumAtoms(self, mol_sts):
        """
        Splits unique and non-unique molecules using the number of atoms.
        Unique molecules have a unique number of atoms in the system.

        :param list mol_sts: A list of extracted molecule structures

        :returns: The first element is the list of molecules with a unique
            number of atoms and the second element is the list of molecules
            that have a non-unique number of atoms
        :rtype: tuple(list, list)
        """
        # Group the molecules by atom count; groups of size one are unique
        num_atoms_mol = defaultdict(list)
        for mol in mol_sts:
            num_atoms_mol[mol.atom_total].append(mol)
        return self._splitUniqueValue(num_atoms_mol)

    def splitUniqueMolsUsingFormula(self, mol_sts):
        """
        Splits unique and non-unique molecules using the molecular formula.
        Unique molecules have a unique molecular formula.

        :param list mol_sts: A list of extracted molecule structures

        :returns: The first element is the list of molecules with a unique
            molecular formula and the second element is the list of molecules
            that have a non-unique molecular formula
        :rtype: tuple(list, list)
        """
        # Group the molecules by formula; groups of size one are unique
        formula_mol = defaultdict(list)
        for mol in mol_sts:
            formula = analyze.generate_molecular_formula(mol)
            formula_mol[formula].append(mol)
        return self._splitUniqueValue(formula_mol)

    def getUniqueMolsFromSmarts(self, mol_sts):
        """
        Get a representative molecule for each unique molecular SMARTS.

        :param list mol_sts: A list of extracted molecule structures

        :returns: Representative structure for each molecule with unique
            molecular SMARTS
        :rtype: list(`structure.Structure`)
        """
        unique_mols = []
        seen_smarts = set()
        for mol in mol_sts:
            smarts = rdpattern.to_smarts(mol)
            # The first molecule seen for a given SMARTS is its representative
            if smarts not in seen_smarts:
                seen_smarts.add(smarts)
                unique_mols.append(mol)
        return unique_mols

    def getUniqueMols(self):
        """
        Get the molecule numbers of the unique representative molecules in the
        structure

        :rtype: list
        :return: list of molecule numbers that are unique
        """
        all_mol_structs = self._getAllMolSts()

        # Separate the molecules based on number of atoms
        num_unique_mols, non_unique_mols = self.splitUniqueMolsUsingNumAtoms(
            all_mol_structs)

        # Separate the non-unique molecules based on the chemical formula
        formula_unique_mols, non_unique_mols = self.splitUniqueMolsUsingFormula(
            non_unique_mols)

        # Find the representative molecules for each SMARTS among the
        # non-unique molecules. This is the robust method but takes
        # significant amount of time. The above two methods try to reduce the
        # load on this function
        smart_unique_mols = self.getUniqueMolsFromSmarts(non_unique_mols)

        unique_mols = smart_unique_mols + formula_unique_mols + num_unique_mols
        unique_mols_nums = [x.property[MOLECULE_NUM] for x in unique_mols]
        return unique_mols_nums

    def getUniqueStruct(self):
        """
        Gets the structure comprising only unique representative molecules.

        :returns: The structure with only unique molecules
        :rtype: `structure.Structure`
        """
        # Collect the atom indices of every representative molecule
        unique_aids = msutils.flatten([
            self.struct.molecule[x].getAtomIndices()
            for x in self.unique_mol_nums
        ])
        # NOTE(review): the second positional argument presumably asks for
        # structure properties to be copied - confirm against
        # Structure.extract
        unique_st = self.struct.extract(unique_aids, True)
        return unique_st