
"""
A module for sorting structure files by Structure-level property values.
The module supports multi-key sorting, 'block' sorting, and file merging.

'sort_criteria' and 'intra_block_sort_criteria' are lists of tuples,
where each tuple is a ct-level property dataname and an
ascending/descending directive for that dataname.  If a structure does
not have a particular property, it is sorted last (even when sorting in
ascending order).  This is consistent with Excel and the Maestro Project
Table.

'Block sorting' is possible by using the auxiliary
'intra_block_sort_criteria' sort keys.  Block sorting organizes structures
into groups by the 'intra_block_sort_criteria' set of keys, then orders
those groups by their leading member's 'sort_criteria'.  Put another way,
'intra_block_sort_criteria' specifies how to organize structures *within*
a block, and 'sort_criteria' specifies how to organize the blocks.
If 'intra_block_sort_criteria' is None, then a simple multi-key sort is
performed using the 'sort_criteria'.  For example, if you have a pose
file with multiple poses for each ligand-title, a useful global order
is to have all poses with the same title in a contiguous block ordered
by Emodel values, and title-blocks ordered by the Glide score of the
first member in each title-block.

Copyright Schrodinger, LLC. All rights reserved

"""

# FIXME:
#- Change function signatures so file_name is a tuple, (name, starting
#  index), and/or change the function signatures to take structure
#  iterators, a structure writer, or generic callables.
#- Determine how the mechanisms behave on local and network
#  filesystems.

################################################################################
# Packages
################################################################################

import heapq
import os
import shutil
import tempfile

import schrodinger.structure as structure
import schrodinger.utils.cmdline as cmdline
import schrodinger.utils.fileutils as fileutils
import schrodinger.utils.log as log

################################################################################
# Globals/Constants
################################################################################
_version = '$Revision: 1.19 $'
ASCENDING = 1
DESCENDING = -1
CHUNK_SIZE = 2500000  # _in_memory_sort_ok constant, in bytes.

# tempfile.mkstemp constants
MKTMPSUFFIX = '.mae'  # tmp files use uncompressed Maestro format, '.mae' extension.

# Some predefined sort keys.
# FIXME:  Poor variable names.  These are not really 'keys', but rather
# criteria (the key and order).
# EV90000: updated key 1 to use docking score instead of gscore prop.
GLIDE_SP_KEY_1 = [("b_glide_receptor", ASCENDING),
                  ("r_i_docking_score", ASCENDING)]
GLIDE_SP_KEY_2 = [("s_m_title", ASCENDING), ("r_i_glide_emodel", ASCENDING)]
GLIDE_XP_KEY_1 = GLIDE_SP_KEY_1  # Same as SP in suite2008
GLIDE_XP_KEY_2 = [("s_m_title", ASCENDING), ("i_glide_XP_PoseRank", ASCENDING)]
GLIDE_HTVS_KEY_1 = GLIDE_SP_KEY_1  # Same as SP in suite2008
GLIDE_HTVS_KEY_2 = GLIDE_SP_KEY_2  # Same as SP in suite2008

################################################################################
# Logging
################################################################################
logger = log.get_output_logger(__file__)
logger.setLevel(log.logging.INFO)

################################################################################
# Functions
################################################################################


def _in_memory_sort_ok(file_name, chunk_size=CHUNK_SIZE):
    """
    Returns True if file_name is small enough to sort in memory, otherwise
    False.

    The test is based on the size, in bytes, of the first 100 structures in
    file_name and the number of structures in file_name. If the size in bytes
    of the first 100 structures is less than chunk_size the file is assumed to
    contain ligand-sized structures, otherwise it is assumed to contain
    receptor-sized structures. The type of structures determines a limit on the
    structure count that can be sorted in memory: 1x10^3 receptor-sized
    structures, or 1x10^4 ligand-sized structures (hardwired values). If the
    count of structures in file_name is less than the limit then the file
    should be sortable in memory.


    :param file_name: Path to the structure file on which to operate.
    :type file_name: str

    :param chunk_size: The size, in bytes, used to estimate the scale of
        the structures in the file.  Default is the module constant
        CHUNK_SIZE.
    :type chunk_size: int

    :return: True if file_name is small enough to sort in memory,
        otherwise False.
    :rtype: bool
    """

    # The routine is taken from sortpackage.py, by Dan Gendler and
    # Paul Sanchagrin.  The size of a file containing the first 100
    # structures is assayed to determine a structure count threshold.
    # The threshold is either 1x10^4 (for a file that presumably contains
    # ligand-sized structures) or 1x10^3 (for a file that presumably
    # contains receptor-sized structures).  If the size of the sample
    # file is less than 2,500,000 bytes, the 1x10^4 threshold is used;
    # otherwise, the smaller 1x10^3.  If the number of structures in the
    # file to sort is greater than the threshold, the sort-split-merge
    # mechanism is recommended; otherwise, a 'straight' in-memory sort is
    # recommended.

    # Write the first 100 structures to a temporary file.
    logger.debug("_in_memory_sort_ok:  Checking first 100 structures of %s" %
                 file_name)

    # Create a temp file and writer for the 100 sts.
    temp_file_name = _get_temp_file_name(dir=None)
    st_writer = structure.StructureWriter(temp_file_name)
    for index, tst in enumerate(structure.MaestroTextReader(file_name)):
        st_writer.append(tst)
        if index == 99:
            break  # Stop after 100 structures (indices 0-99).
    st_writer.close()

    # Determine the size of the temporary file, which provides an
    # estimate for the entire input file.
    file_size = os.path.getsize(temp_file_name)
    logger.debug("_in_memory_sort_ok:  100 structures of %s = %d bytes" %
                 (file_name, file_size))
    fileutils.force_remove(temp_file_name)

    # Determine the count threshold from the file size estimate.  If the
    # first 100 structures are bigger than chunk_size, they are probably
    # receptor-sized structures, so fewer are held in memory.
    count_threshold = None
    if file_size < chunk_size:
        count_threshold = 1e4
    else:
        count_threshold = 1e3

    # If the count of structures is less than the count_threshold it is
    # ok to sort in memory.
    for index, tst in enumerate(structure.MaestroTextReader(file_name)):
        if index > count_threshold:
            logger.debug(
                "_in_memory_sort_ok:  %s is too big to sort in memory." %
                (file_name))
            return False

    # Must be ok to sort in memory.
    logger.debug("_in_memory_sort_ok: %s is small enough to sort in memory." %
                 file_name)
    return True
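

# Worked illustration of the heuristic above (the byte counts are assumed
# example numbers): if the first 100 structures occupy 400,000 bytes
# (< CHUNK_SIZE), the file is treated as ligand-sized and up to 1x10^4
# structures may be sorted in memory; at 3,000,000 bytes (>= CHUNK_SIZE)
# it is treated as receptor-sized and the limit drops to 1x10^3.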


def sort_file(file_name,
              sort_criteria,
              out_file_name=None,
              intra_block_sort_criteria=None,
              no_split=False):
    """
    Sorts a structure file by the values of ct-level properties within the
    file.  This is the central API; under the hood it chooses a reasonable
    trade-off between disk I/O and memory use given the size of the file.

    :param file_name: Path to the file upon which to operate.
    :type file_name: str

    :param sort_criteria: List of (m2io dataname, module constant) tuples.
        These are the primary, secondary, ..., keys for sorting the
        structures, *or* the blocks if intra_block_sort_criteria is
        defined, and optional ascending/descending constants.  e.g.
        [('s_m_title', sort.ASCENDING),
        ('r_i_glide_docking_score', sort.ASCENDING)]
    :type sort_criteria: list(tuple)

    :param out_file_name: Output structure file containing the sorted
        structures.  If out_file_name is None, the input file is clobbered
        with the results of the sort.  Default is to replace the input
        file_name with the sorted results.
    :type out_file_name: str

    :param intra_block_sort_criteria: Optional list of (m2io dataname,
        module constant) tuples for block sorting.  These are the primary,
        secondary, ..., keys for sorting the structures *within* blocks,
        and optional ascending/descending order constants.  Default is
        None, don't block sort.
    :type intra_block_sort_criteria: list(tuple)

    :param no_split: Unused parameter.
    """

    # Clobber the input file unless an output file is provided.
    if not out_file_name:
        out_file_name = file_name

    # Do the sorting in memory if possible.
    if _in_memory_sort_ok(file_name):
        logger.debug("sort_file: sorting with sort_file_in_memory")
        sort_file_in_memory(file_name, sort_criteria, out_file_name,
                            intra_block_sort_criteria)
    # For big files, use the StructureFileSorter.
    else:
        logger.debug("sort_file: sorting with StructureFileSorter MRWP")
        sf_sorter = StructureFileSorter(file_name,
                                        1,
                                        sort_criteria,
                                        intra_block_sort_criteria,
                                        keep_structures=False)
        sf_sorter.sort()
        sf_sorter.write(out_file_name, sf_sorter.structure_index_order)
    return
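

# Usage sketch (illustrative; 'ligands.mae' is a hypothetical file name):
# a plain multi-key sort, ascending by title, with ties broken by docking
# score in descending order:
#
#     sort_file('ligands.mae',
#               sort_criteria=[('s_m_title', ASCENDING),
#                              ('r_i_docking_score', DESCENDING)],
#               out_file_name='ligands_sorted.mae')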


def split_file(file_name, max_count=10000, dir=None):
    """
    Returns a list of file names generated by splitting the structures in
    file_name into smaller files.

    :param file_name: Path to the structure file upon which to operate.
    :type file_name: str

    :param max_count: Maximum number of structures per sub-file.
    :type max_count: int

    :param dir: Path to the directory where the sub-files are written.
        The default is the runtime current working directory.  There needs
        to be enough space to store, effectively, a copy of file_name.
        For really large files, /tmp is not a good location on most hosts.
    :type dir: str
    """

    logger.debug("split_file in: %s" % (file_name))
    out_file_list = []

    if dir is None:
        dir = os.getcwd()

    file_suffix = fileutils.splitext(file_name)[1]
    temp_file_name = _get_temp_file_name(dir=dir, suffix=file_suffix)
    out_file_list.append(temp_file_name)
    st_writer = structure.StructureWriter(temp_file_name)

    for index, tst in enumerate(structure.MaestroTextReader(file_name)):
        # Create a new sub-file as needed.
        if index > 0 and divmod(index, max_count)[1] == 0:
            st_writer.close()  # Close the previous sub-file.
            temp_file_name = _get_temp_file_name(dir=dir, suffix=file_suffix)
            out_file_list.append(temp_file_name)
            st_writer = structure.StructureWriter(temp_file_name)
        # Write the structure to the sub-file.
        st_writer.append(tst)

    st_writer.close()  # Close the last sub-file.
    logger.debug("split_file out:\n%s" % "\n".join(out_file_list))
    return out_file_list


def merge_files(file_list,
                sort_criteria,
                out_file_name,
                remove_file_list=True,
                sort_file_list=False,
                dir=None):
    """
    Combines pre-ordered structure files by their property values.  Input
    files are assumed to be sorted by default.  Optionally, the files can
    be sorted by sort_criteria prior to merging by setting
    sort_file_list=True.

    :note: This function is not suited for handling pose viewer files
        because all receptors would be included in the output.  See
        `merge_pv_files`.

    :param file_list: List of paths for the structure files that will be
        merged.
    :type file_list: list

    :param sort_criteria: List of (m2io dataname, module constant) tuples,
        which are the primary keys for sorting the structures.
    :type sort_criteria: list

    :param out_file_name: Path to the structure output file containing all
        the merged structures.
    :type out_file_name: str

    :param remove_file_list: If True, the files named in file_list are
        removed from disk.
    :type remove_file_list: bool

    :param sort_file_list: If True, sort the files by 'sort_criteria'
        prior to merging.  Default is False, assume the file_list members
        are already sorted.
    :type sort_file_list: bool

    :param dir: Unused parameter.
    """

    st_writer = structure.StructureWriter(out_file_name)

    # Sort the individual files if needed.
    if sort_file_list:
        logger.debug("merge_files: sorting input file list.")
        for file_name in file_list:
            sort_file(file_name, sort_criteria)

    # Create a list of MaestroTextReader iterators for merging.
    st_iters = []
    for file_name in file_list:
        st_iters.append(structure.MaestroTextReader(file_name))

    merge_st_iters(st_iters, sort_criteria, st_writer)

    # Done writing structures.
    for st_iter in st_iters:
        st_iter.close()
    st_writer.close()

    # Perform file cleanup if requested.
    if remove_file_list:
        for file_name in file_list:
            try:
                fileutils.force_remove(file_name)
            except Exception as e:
                logger.info(
                    "merge_files: failed to remove file: {}: Error {}".format(
                        file_name, e))
    return
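

# Sketch (hypothetical file name): an explicit split/sort/merge round
# trip, one way to order a file that is too large to sort in memory (the
# 'sort-split-merge' mechanism mentioned in _in_memory_sort_ok):
#
#     criteria = [('r_i_docking_score', ASCENDING)]
#     parts = split_file('huge.mae', max_count=10000)
#     merge_files(parts, criteria, 'huge_sorted.mae',
#                 remove_file_list=True, sort_file_list=True)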


def merge_pv_files(file_list, sort_criteria, out_file_name):
    """
    Combines pre-ordered pose viewer structure files by their property
    values.  Input files are assumed to be ordered.  Only the receptor
    from the first pose viewer file is retained.

    :param file_list: List of paths for the pose viewer files that will be
        merged.
    :type file_list: list

    :param sort_criteria: List of (m2io dataname, module constant) tuples,
        which are the primary keys for sorting the ligand structures.
    :type sort_criteria: list(tuple)

    :param out_file_name: Path to the structure output file containing all
        the merged structures.
    :type out_file_name: str
    """

    # Add the receptor from the first pv file.
    st_writer = structure.StructureWriter(out_file_name)
    st_writer.append(next(structure.MaestroTextReader(file_list[0])))

    st_iters = []
    for file_name in file_list:
        st_iters.append(structure.MaestroTextReader(file_name, index=2))

    # Merge the structure streams with output appended to the writer.
    merge_st_iters(st_iters, sort_criteria, st_writer)

    # Done writing structures.
    st_writer.close()
    return


def merge_st_iters(structure_iters, sort_criteria, output_handle):
    """
    Combines pre-ordered structure iterators by their property values.

    :param structure_iters: List of iterables that emit structures.
        Emitted structures can be full Structures, MaestroText structures,
        or other objects with a property dictionary.
    :type structure_iters: list

    :param sort_criteria: List of (m2io dataname, module constant) tuples,
        which are the primary keys for sorting the structures.
    :type sort_criteria: list

    :param output_handle: Output stream to which the sorted structures are
        appended.
    :type output_handle: An object with an append() method.
    """
    key_gen = _CriteriaKeyGenerator(sort_criteria)
    for st in heapq.merge(*structure_iters, key=key_gen.structKey):
        output_handle.append(st)
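

# Sketch (hypothetical file names): merge_st_iters() accepts any iterables
# of objects with a property dictionary and any output object with an
# append() method, so a plain list can collect the merged stream.
# heapq.merge() consumes the input iterators lazily, holding only one
# structure per stream at a time.
#
#     readers = [structure.MaestroTextReader('a_sorted.mae'),
#                structure.MaestroTextReader('b_sorted.mae')]
#     merged = []
#     merge_st_iters(readers, [('r_i_docking_score', ASCENDING)], merged)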


def sort_file_in_memory(file_name,
                        sort_criteria,
                        out_file_name=None,
                        intra_block_sort_criteria=None):
    """
    Orders the structures in file_name, keeping the structures in memory
    during the sort operation.

    :param file_name: Path to the file upon which to operate.
    :type file_name: str

    :param sort_criteria: List of (m2io dataname, module constant) tuples,
        which are the primary keys for sorting the structures and optional
        sort order constants.
    :type sort_criteria: list(tuple)

    :param out_file_name: Output structure file containing the sorted
        structures.  If out_file_name is None, the input file_name is
        clobbered with the sorted results.
    :type out_file_name: str or None

    :param intra_block_sort_criteria: List of (m2io dataname, module
        constant) tuples, which are the properties for sorting the
        structures within groups, and optional sort order constants.
    :type intra_block_sort_criteria: list(tuple) or None
    """

    logger.debug(
        "sort_file_in_memory: file %s, output %s, sort_criteria %s, "
        "intra_block_sort_criteria %s" %
        (file_name, out_file_name, sort_criteria, intra_block_sort_criteria))

    # Clobber the input file when no output file is given, as documented.
    if not out_file_name:
        out_file_name = file_name

    sf_sorter = StructureFileSorter(file_name,
                                    1,
                                    sort_criteria,
                                    intra_block_sort_criteria,
                                    keep_structures=True)
    sf_sorter.sort()
    sf_sorter.write(out_file_name, sf_sorter.structure_index_order)
    return


def _get_temp_file_name(dir=None, suffix=MKTMPSUFFIX):
    """
    Returns the path to a new temporary file that is safe to append
    structures to.

    :param dir: Path to a directory with write permissions, where
        temporary files can be created.  Default is None, use the tempfile
        default, which appears to be /tmp.
    :type dir: str

    :param suffix: Optional suffix for temporary files.  Default is the
        module constant MKTMPSUFFIX.
    :type suffix: str
    """

    # Create a temp file for the intermediate results, with a name like
    # "sort_tmp_UNIQUEGIBBERISH.mae".
    (temp_file_desc, temp_file_name) = tempfile.mkstemp(suffix=suffix,
                                                        prefix='sort_tmp_',
                                                        dir=dir,
                                                        text=True)
    os.close(temp_file_desc)
    logger.debug("_get_temp_file_name: %s." % temp_file_name)
    return temp_file_name


################################################################################
# Classes
################################################################################
class StructureFileSorter(object):
    """
    A class to sort structure files by ct-level property values.

    API Example::

        glide_sp_pv_sorter = sort.StructureFileSorter(
            file_name='foo_pv.mae',
            file_index=2
        )
        glide_sp_pv_sorter.sort()
        glide_sp_pv_sorter.writeTopNFromBlock('bar_lib.mae', 2)

        st_sorter = sort.StructureFileSorter(
            file_name="baz.mae",
            sort_criteria=[('r_prop_one', sort.ASCENDING),
                           ('i_prop_two', sort.DESCENDING)]
        )
        st_sorter.sort()
        st_sorter.write('baz-sorted.mae')

    :vartype structure_index_order: list
    :ivar structure_index_order: Sorted structure index order.  A list of
        the original file indexes, in the order they appear when sorted by
        sort_criteria and intra_block_sort_criteria.

    :vartype structure_dict: dict
    :ivar structure_dict: File index keys for the ct-level property
        dictionary.

    :vartype structure_block_order: list
    :ivar structure_block_order: Block_ids sorted by 'sort_criteria' keys.

    :vartype structure_count: int
    :ivar structure_count: The number of structures in the file.

    :vartype read_forward_quota: int
    :ivar read_forward_quota: Sort in batches, with this chunk size,
        instead of with random access.  If the value evaluates as True,
        the input file is read, forward-only, in small chunks that are
        sorted in memory.  Default is 0, use random access.

    An instance is primarily a data structure where the original file
    positions are keys for the dictionary of properties.  It has auxiliary
    data structures for tracking the sorted order of the original file
    positions, and methods to write output files in that order.

    Using random access to re-read the structures in the proper order is
    typically faster than re-reading in batches.  However, the
    read_forward_quota attribute can be set to a positive integer to force
    batch re-reading/writing.
    """

    def __init__(self,
                 file_name=None,
                 file_index=1,
                 sort_criteria=GLIDE_SP_KEY_1,
                 intra_block_sort_criteria=None,
                 keep_structures=False):
        """
        Loads only the structure properties used to sort the file into a
        dictionary (keyed by file index), but does not do any sorting.

        :param file_name: Path to the structure file upon which to
            operate.
        :type file_name: str

        :param file_index: File position at which to start reading
            file_name.
        :type file_index: int

        :param sort_criteria: List of (m2io dataname, module constant)
            tuples that identify the values for sorting and the sort
            order.
        :type sort_criteria: list(tuple)

        :param intra_block_sort_criteria: List of (m2io dataname, module
            constant) tuples that identify the values for group sorting,
            and the sort order.  If None, a simple multi-key sort is
            performed using the 'sort_criteria'.
        :type intra_block_sort_criteria: list(tuple) or None

        :param keep_structures: If True, a reference to each structure is
            kept, keyed by '_structure'.  The default is False, don't keep
            references to the structures.
        :type keep_structures: bool
        """
        self.file_name = file_name
        self.file_index = file_index
        self.sort_criteria = sort_criteria
        self.intra_block_sort_criteria = intra_block_sort_criteria
        self.keep_structures = keep_structures
        self.read_forward_quota = 0

        # Final sorted structure index order.
        self.structure_index_order = []
        # Order of the block_ids after sorting.
        self.structure_block_order = []

        # Store the structure properties needed to sort, keyed by the
        # initial file position index.
        self.structure_dict = {}
        sort_prop_keys = set()
        for prop, prop_sort_order in self.sort_criteria:
            sort_prop_keys.add(prop)
        if self.intra_block_sort_criteria:
            for prop, prop_sort_order in self.intra_block_sort_criteria:
                sort_prop_keys.add(prop)

        reader = structure.MaestroReader(self.file_name, self.file_index)
        for index, tst in enumerate(reader):
            # Save the actual file index for bookkeeping.
            index += self.file_index
            # Save only the relevant properties to conserve memory.
            self.structure_dict[index] = {}
            for prop in sort_prop_keys:
                self.structure_dict[index][prop] = tst.property.get(prop)
            # Stash a reference to the structure if requested.
            if self.keep_structures:
                self.structure_dict[index]['_structure'] = tst
            # Stash the offset to this structure for retrieval.
            offset = reader.last_position
            self.structure_dict[index]['_offset'] = offset

        # The number of structures loaded from the file.
        self.structure_count = len(self.structure_dict)
        return

    def sort(self):
        """
        Organizes the data structure by self.sort_criteria, and by
        self.intra_block_sort_criteria if it is not None.  Assigns
        attributes recording the correct sorted order of the original
        file positions.
        """

        # Reset data structures as appendable lists.
        self.structure_index_order = []
        self.structure_block_order = []

        # Block sort first if we have auxiliary keys.
        if self.intra_block_sort_criteria:
            logger.debug("StructureFileSorter: intra-block sorting.")
            key_gen = _CriteriaKeyGenerator(self.intra_block_sort_criteria)

            # Organize the blocks.
            # Principal block key (dataname, order).
            block_key = self.intra_block_sort_criteria[0]

            def key_for_file(file_index):
                return key_gen.propKey(self.structure_dict[file_index])

            # Keys are block ids, values are lists of file indices.
            self.structure_block_dict = {}
            for file_index in sorted(self.structure_dict, key=key_for_file):
                # Use string hash keys for the block ids.
                block_id = str(self.structure_dict[file_index].get(
                    block_key[0]))
                self.structure_block_dict.setdefault(block_id, [])
                self.structure_block_dict[block_id].append(file_index)

            key_gen = _CriteriaKeyGenerator(self.sort_criteria)

            # Order the block ids by the sort keys of each block's
            # leading member.
            logger.debug("StructureFileSorter: ordering blocks.")

            def key_for_block(block_id):
                file_index = self.structure_block_dict[block_id][0]
                return key_gen.propKey(self.structure_dict[file_index])

            for block_id in sorted(self.structure_block_dict,
                                   key=key_for_block):
                self.structure_block_order.append(block_id)

            # Flatten the file index blocks in the sorted order.
            logger.debug("StructureFileSorter: flattening blocks.")
            for block_id in self.structure_block_order:
                for file_index in self.structure_block_dict[block_id]:
                    self.structure_index_order.append(file_index)

        # Generic multi-key sort.
        else:
            logger.debug("StructureFileSorter: general multi-key sorting.")
            key_gen = _CriteriaKeyGenerator(self.sort_criteria)

            def key_for_file(file_index):
                return key_gen.propKey(self.structure_dict[file_index])

            for file_index in sorted(self.structure_dict, key=key_for_file):
                self.structure_index_order.append(file_index)
        return

    def write(self, out_file_name, index_list=None, dir=None):
        """
        Writes structures to disk; no return value.

        :param out_file_name: Path to the output structure file.
        :type out_file_name: str

        :param index_list: List of file indexes to write, in the order
            that they should appear in the output file (typically a slice
            of self.structure_index_order).  If None, all of
            self.structure_index_order is written.
        :type index_list: list

        :param dir: Path to the directory where the intermediate file is
            written.  The default is the runtime current working
            directory.  There needs to be enough space to store,
            effectively, a copy of file_name.  For really large files,
            /tmp is not a good location on most hosts.
        :type dir: str
        """

        if not index_list:
            index_list = self.structure_index_order[:]

        # Create a temp file and writer for output.
        if dir is None:
            dir = os.getcwd()
        file_suffix = fileutils.splitext(out_file_name)[1]
        temp_file_name = _get_temp_file_name(dir=dir, suffix=file_suffix)
        st_writer = structure.StructureWriter(temp_file_name)

        # Use the structures in memory.
        if self.keep_structures:
            logger.debug("StructureFileSorter: write from data structure.")
            for sorted_file_index in index_list:
                tst = self.structure_dict[sorted_file_index]['_structure']
                st_writer.append(tst)

        # Re-read from disk in batches.
        elif self.read_forward_quota:
            while index_list:
                i_slice = []
                for i in range(1, self.read_forward_quota):
                    try:
                        i_slice.append(index_list.pop(0))
                    except IndexError:
                        break  # Exit the for loop.
                logger.debug("StructureFileSorter: sorting slice:\n %s" %
                             i_slice)
                mae_text_reader = structure.MaestroTextReader(self.file_name)
                decorated_text_sts = []
                for file_index, tst in enumerate(mae_text_reader):
                    file_index += self.file_index
                    if file_index in i_slice:
                        item = (i_slice.index(file_index), tst)
                        decorated_text_sts.append(item)
                decorated_text_sts.sort()
                for _, tst in decorated_text_sts:
                    st_writer.append(tst)

        # Re-read from disk, random access.
        else:
            logger.debug("StructureFileSorter: re-read and write from disk.")
            offsets = []
            for index in index_list:
                offsets.append(self.structure_dict[index]['_offset'])
            reader = structure.MaestroReader(self.file_name)
            for offset in offsets:
                st = reader.read(position=offset)
                st_writer.append(st)

        # Done writing structures.
        st_writer.close()

        # Clobber the out file name with the temp file.
        shutil.move(temp_file_name, out_file_name)
        return

    def writeTopNFromBlock(self,
                           out_file_name="",
                           max_per_block=1,
                           max_num_block=None):
        """
        Writes the first max_per_block structures from each block to the
        output file.

        :param out_file_name: Name of the structure file to write.
        :type out_file_name: str

        :param max_per_block: Number of leading members, from each block,
            to write to out_file_name.  Default is 1.
        :type max_per_block: int

        :param max_num_block: Number of blocks from which to draw leading
            members.  If the value is None, max_per_block structures are
            pulled from every block.  Otherwise, the top max_per_block
            structures from just the top max_num_block blocks are written.
        :type max_num_block: int or None
        """

        # Container for the indexes.
        index_list = []

        # Set the number of items to grab.
        N = max_per_block
        if max_num_block is None:
            M = len(self.structure_block_order)
        else:
            M = max_num_block

        # For the top M blocks, get the file indexes of the top N per
        # block.
        for block_id in self.structure_block_order[:M]:
            index_list.extend(self.structure_block_dict[block_id][:N])

        logger.debug("StructureFileSorter: writeTopNFromBlock %d per block" %
                     N)
        logger.debug("StructureFileSorter: writeTopNFromBlock %d blocks" % M)
        logger.debug("StructureFileSorter: writeTopNFromBlock indexes:\n")
        logger.debug(index_list)

        # Write them out.
        self.write(out_file_name, index_list)
        return
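

# Sketch (hypothetical file names): keep only the best pose per title from
# a Glide pose viewer file.  Title blocks are Emodel-ordered internally,
# and the blocks are ordered by the leading pose's docking score:
#
#     sorter = StructureFileSorter(file_name='poses_pv.mae',
#                                  file_index=2,
#                                  sort_criteria=GLIDE_SP_KEY_1,
#                                  intra_block_sort_criteria=GLIDE_SP_KEY_2)
#     sorter.sort()
#     sorter.writeTopNFromBlock('best_per_title.mae', max_per_block=1)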


class _CriteriaKeyGenerator(object):
    """
    Class that provides methods for generating sort "keys" for structures
    or property dictionaries, given sort criteria.
    """

    def __init__(self, sort_criteria):
        """
        :param sort_criteria: List of (m2io dataname, module constant)
            tuples that identify the values for sorting and the sort
            order.
        :type sort_criteria: list(tuple)
        """
        self.sort_criteria = sort_criteria

    @classmethod
    def _valueKey(cls, value, order=ASCENDING):
        """
        Generates a sort key for the given value.  The value None, which
        represents a missing property value for a CT, is always sorted
        last, whether the order is ascending or descending.

        :param value: Value to sort
        :type value: int, float, str, or bool

        :param order: Sort order
        :type order: int

        :return: Sort key
        :rtype: tuple
        """
        if value is None:
            # Always place None values last (both ascending and
            # descending):
            key = (True, None)
        else:
            if order == DESCENDING:
                # Invert the value so an ascending comparison yields a
                # descending order.
                if isinstance(value, str):
                    value = [-ord(char) for char in value]
                elif isinstance(value, bool):
                    value = not value
                else:
                    # int or float
                    value = -value
            else:
                assert order == ASCENDING
            key = (False, value)
        return key

    def structKey(self, st):
        """
        Returns a sort key for the given structure, based on the sort
        criteria.

        :param st: Structure to process
        :type st: structure.Structure

        :return: Sort key
        :rtype: list
        """
        return self.propKey(st.property)

    def propKey(self, prop_dict):
        """
        Returns a sort key for the given property dict, based on the sort
        criteria.

        :param prop_dict: Dictionary of properties and values
        :type prop_dict: dict

        :return: Sort key
        :rtype: list
        """
        item = []
        for prop_name, order in self.sort_criteria:
            value = prop_dict.get(prop_name)
            key = self._valueKey(value, order)
            item.append(key)
        return item


################################################################################
# Main
################################################################################

# Command line wrapper for the module's sort() function.
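# Example invocations (file names are illustrative; 'sort.py' stands in
# for this module's script path):
#
#     # Sort a file in place, ascending by docking score:
#     $SCHRODINGER/run sort.py -k r_i_docking_score poses.mae
#
#     # Block sort: title blocks ordered internally by Emodel, blocks
#     # ordered by docking score; write to a new file:
#     $SCHRODINGER/run sort.py -k r_i_docking_score -K s_m_title \
#         -K r_i_glide_emodel -o sorted.mae poses.mae
#
#     # Merge pre-sorted files and remove the inputs:
#     $SCHRODINGER/run sort.py -M -k r_i_docking_score -o merged.mae \
#         part1.mae part2.mae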
def main():
    (opts, args) = parse_arguments()

    if opts.verbose:
        logger.setLevel(log.logging.DEBUG)

    # Check the input files; make sure they really exist.
    file_list = []
    for file_name in args:
        if not os.path.isfile(file_name):
            print("Skipping %s. File does not exist." % file_name)
        else:
            file_list.append(file_name)

    # Process the key arguments.  Strip off the key modifiers if they
    # exist, and assign the sort order.
    key_1 = []
    for key_1_opt in opts.key_1:
        if key_1_opt.startswith('+'):
            key_1.append((key_1_opt.lstrip('+'), ASCENDING))
        elif key_1_opt.startswith('-'):
            key_1.append((key_1_opt.lstrip('-'), DESCENDING))
        else:
            key_1.append((key_1_opt, ASCENDING))

    key_2 = []
    if opts.key_2:
        for key_2_opt in opts.key_2:
            if key_2_opt.startswith('+'):
                key_2.append((key_2_opt.lstrip('+'), ASCENDING))
            elif key_2_opt.startswith('-'):
                key_2.append((key_2_opt.lstrip('-'), DESCENDING))
            else:
                key_2.append((key_2_opt, ASCENDING))

    # Sort or merge the input.
    if opts.merge or opts.merge_cleanup:
        merge_files(file_list, key_1, opts.output, opts.merge_cleanup)
    else:
        for file_name in file_list:
            sort_file(file_name, key_1, opts.output, key_2)


def parse_arguments():
    # Get a Schrodinger option parser, and populate it.
    script_usage = "\n$SCHRODINGER/run %prog -k <dataname> [options] <structure_file>... \n or \n$SCHRODINGER/run %prog -m -k <dataname> [options] <structure_file>... -o <output_structure_file>"
    script_desc = "Sort structure files by ct-level property values."
    parser = cmdline.SingleDashOptionParser(usage=script_usage,
                                            version_source=_version)
    parser.set_description(script_desc)
    parser.add_option(
        "-k",
        "-key_1",
        dest="key_1",
        action="append",
        help="""Add a property dataname by which to sort or merge the
structures.  If multiple keys are given, they are applied in the order
they appear on the command line.  e.g. with '-k s_m_title -k
r_i_glide_gscore', title is the chief sort criterion, ties are broken by
the Glide score, and so on.""")
    parser.add_option(
        "-K",
        "-key_2",
        dest="key_2",
        action="append",
        help="""'Block sort': add a property dataname by which to organize
related structures.  Blocks are ordered by the leading member's order
with respect to -key_1.  e.g. '-key_2 s_m_title -key_2 r_i_glide_emodel'
sorts such that structures with the same title are contiguous and sorted
by Emodel.  The contiguous blocks are ordered by the -key_1 value(s) of
each block's first member.""")
    parser.add_option(
        "-o",
        "-output",
        dest="output",
        default=None,
        help="Name of the merge output file, or optional file name for "
        "sorted output.  An output file is required for merging.  Sorting "
        "replaces the input file by default, but sorted output can be "
        "redirected to this file name.")
    parser.add_option(
        "-m",
        "-merge",
        dest="merge",
        default=False,
        action="store_true",
        help="Merge the files by key_1 property values; keep the original "
        "files.")
    parser.add_option(
        "-M",
        "-merge_cleanup",
        dest="merge_cleanup",
        default=False,
        action="store_true",
        help="Merge the files by key_1 property values; remove the "
        "original files.")
    parser.add_option("-verbose",
                      dest="verbose",
                      default=False,
                      action="store_true",
                      help="Increased verbosity for debugging.")
    (opts, args) = parser.parse_args()

    # Check for required arguments.
    if (opts.merge or opts.merge_cleanup) and not opts.output:
        parser.error("Output file name required when merging.")
    if not opts.key_1:
        parser.error("One or more dataname property keys are required.")

    return (opts, args)


if __name__ == "__main__":
    main()

# EOF