# Source code for schrodinger.test.hypothesis.strategies.proteins

import itertools
import string

from hypothesis import strategies

from schrodinger import structure
from schrodinger.protein import alignment
from schrodinger.protein import annotation
from schrodinger.protein import predictors
from schrodinger.protein import residue
from schrodinger.protein import sequence
from schrodinger.test.hypothesis.strategies import indices as indices_strats

# Characters available for generated names and single-character chain names.
_LETTERS_AND_NUMBERS = [*string.ascii_letters, *string.digits]

# All alignment annotation types, in a fixed order.
_PROT_ANNOTATIONS = [*annotation.ProteinAlignmentAnnotations.ANNOTATION_TYPES]

# Every combination of alignment annotation types of size 0 .. N-1, where N is
# the number of annotation types.
# NOTE(review): range(N) stops before N, so the combination that contains all
# N annotation types is never produced -- confirm that this is intended.
_PROTEIN_ANN_COMBINATIONS = list(
    itertools.chain.from_iterable(
        itertools.combinations(_PROT_ANNOTATIONS, size)
        for size in range(len(_PROT_ANNOTATIONS))))

# Everything in the protein alphabet that is not a standard amino acid.
_NON_STD_AMINO_ACIDS = list(
    set(residue.get_protein_alphabet().values()).difference(
        residue.STD_AMINO_ACIDS))

_res_prop_anno_set = set(
    annotation.ProteinSequenceAnnotations.RES_PROPENSITY_ANNOTATIONS)

# The annotation types are exposed as lists (not sets) so hypothesis can
# shrink them.
RES_PROP_ANNOS = list(_res_prop_anno_set)
# Sequence annotation types exclude the residue propensity annotations.
SEQ_ANNO_TYPES = list(
    set(annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES) -
    _res_prop_anno_set)
ALN_ANNO_TYPES = list(
    set(annotation.ProteinAlignmentAnnotations.ANNOTATION_TYPES))
del _res_prop_anno_set

for _excluded_anno in (
        # These annotation types can't be enabled through the gui currently.
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.rescode,
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.
        isoelectric_point,
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.hydrophobicity,
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.sasa,
        # Domain annotation fetches from a remote server
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.domains,
):
    SEQ_ANNO_TYPES.remove(_excluded_anno)
del _excluded_anno


@strategies.composite
def _unique_lists_made_of(draw, elements):
    """
    Draw a list of distinct values, each sampled from `elements`.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :param elements: The values to sample from. Must not be a set.
    :type  elements: list or tuple

    :return: A list without repeated values
    :rtype: list
    """
    # Sets are rejected up front; callers are expected to pass a list.
    assert not isinstance(elements, set)
    element_strategy = strategies.sampled_from(elements)
    return draw(strategies.lists(element_strategy, unique=True))


@strategies.composite
def generated_aln_annotations(draw):
    """
    Draw a list of unique annotation types sampled from `ALN_ANNO_TYPES`.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :return: A list of alignment annotation types without repeats
    :rtype: list
    """
    return draw(_unique_lists_made_of(ALN_ANNO_TYPES))


@strategies.composite
def generated_seq_annotations(draw):
    """
    Draw a list of unique annotation types sampled from `SEQ_ANNO_TYPES`.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :return: A list of sequence annotation types without repeats
    :rtype: list
    """
    return draw(_unique_lists_made_of(SEQ_ANNO_TYPES))


@strategies.composite
def generated_res_prop_annotations(draw):
    """
    Draw a list of unique annotation types sampled from `RES_PROP_ANNOS`.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :return: A list of residue propensity annotation types without repeats
    :rtype: list
    """
    return draw(_unique_lists_made_of(RES_PROP_ANNOS))


@strategies.composite
def generated_annotation_lists(draw):
    """
    Draw one of the precomputed combinations of alignment annotation types.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :return: One entry of `_PROTEIN_ANN_COMBINATIONS`. The entries are
        produced by `itertools.combinations`, so each one is a tuple.
    :rtype: tuple
    """
    return draw(strategies.sampled_from(_PROTEIN_ANN_COMBINATIONS))


class AlignmentInfo:
    """
    Container for the data needed to build an alignment in a test.
    """

    def __init__(self, seqs=(), cysteines_to_bond=(), anchor_residues=()):
        """
        An object that can be used in alignment tests

        This object can be used to create an alignment, along with fixtures
        adapted to the alignment that can be used in testing it. We use this
        object so that different tests can create different kinds of alignments
        (regular kinds and undoable alignments).

        :param seqs: List of sequences to be used in constructing an alignment.
            Any falsy value (including the default) is stored as a new empty
            list.
        :type seqs: list

        :param cysteines_to_bond: Pairs of cysteine residues, stored unchanged
            for use by the test.
        :type cysteines_to_bond: list(tuple)

        :param anchor_residues: Residues stored unchanged for use by the test.
        :type anchor_residues: list
        """
        self.seqs = seqs or []
        self.cysteines_to_bond = cysteines_to_bond
        self.anchor_residues = anchor_residues

    def __repr__(self):
        """
        Not a strict repr, but rather a comprehensive summary of the alignment
        info object

        This is useful for hypothesis error messages. The Alignment repr can be
        copied and pasted into a repl for interactive debugging.
        """
        aln = alignment.ProteinAlignment(self.seqs)
        msg = [
            "\n", "AlignmentInfo object used to construct:",
            str(aln), "and containing the additional information:",
            # Actually list the extra data that the header line announces.
            "cysteines_to_bond: {!r}".format(self.cysteines_to_bond),
            "anchor_residues: {!r}".format(self.anchor_residues), "\n",
            "\nA repl-ready representation of the alignment: \n\n",
            repr(aln)
        ]
        return "\n".join(msg)


# Secondary structure values that generated residues are sampled from.
POSSIBLE_SECONDARY_STRUCTURES = (
    structure.SS_NONE,
    structure.SS_LOOP,
    structure.SS_HELIX,
    structure.SS_STRAND,
    structure.SS_TURN,
)


@strategies.composite
def generated_residues(draw, residue_types=None, gaps=True):
    """
    Draw a single residue (or gap) with randomized attributes.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :param residue_types: Alphabet to use for residues. A `None` entry in the
        alphabet produces a gap. If no alphabet is given, an integer between 0
        and 20 is drawn to choose one: one value selects a gap (only when
        `gaps` is true), one value selects the nonstandard amino acids and
        every other value selects the standard amino acids.
    :type  residue_types: list(residue.ElementType)

    :param gaps: Whether to include gaps in default residue types
    :type gaps: bool

    :return: A residue suitable for testing, or a gap
    :rtype: schrodinger.protein.residue.Residue or
        schrodinger.protein.residue.Gap
    """
    if residue_types is None:
        # make residue types more likely to be standard
        selector = draw(strategies.integers(min_value=0, max_value=20))
        if gaps and selector == 0:
            residue_types = [None]
        elif selector == 1:
            residue_types = _NON_STD_AMINO_ACIDS
        else:
            residue_types = residue.STD_AMINO_ACIDS
    res_type = draw(strategies.sampled_from(residue_types))

    if res_type is None:
        return residue.Gap()
    res = residue.Residue(res_type)
    res.temperature_factor = draw(strategies.floats(0, 99))
    res.secondary_structure = draw(
        strategies.sampled_from(POSSIBLE_SECONDARY_STRUCTURES))
    res.pred_secondary_structure = draw(
        strategies.sampled_from(POSSIBLE_SECONDARY_STRUCTURES))
    # Each prediction is either one of the predictor's values or None.
    res.pred_accessibility = draw(
        strategies.sampled_from(list(predictors.SolventAccessibility) + [None]))
    res.pred_disordered = draw(
        strategies.sampled_from(list(predictors.Disordered) + [None]))
    res.pred_domain_arr = draw(
        strategies.sampled_from(list(predictors.DomainArrangement) + [None]))
    # Residue numbers can be negative
    res.resnum = draw(strategies.integers(-999, 9999))
    # The most common insertion codes are ' ', 'A', and 'B'
    inscodes = ' ' + string.ascii_letters
    res.inscode = draw(strategies.sampled_from(inscodes))
    return res


# For performance reasons, we use `simple_sequences` when generating
# alignments and `generated_sequences` for tests specifically testing
# `sequence` functionality. We hope to fix this in MSV-1537.
_SIMPLE_SEQUENCE_KWARGS = {
    "elements": strategies.lists(elements=generated_residues()),
    "name": strategies.text(alphabet=_LETTERS_AND_NUMBERS),
    "entry_id": strategies.text(alphabet=string.digits, min_size=1),
    "chain": strategies.text(alphabet=string.ascii_letters, max_size=1),
    "long_name": strategies.text(alphabet=_LETTERS_AND_NUMBERS),
}
# Each keyword argument of the ProteinSequence constructor is drawn from the
# strategy stored under the same name.
simple_sequences = strategies.builds(sequence.ProteinSequence,
                                     **_SIMPLE_SEQUENCE_KWARGS)


@strategies.composite
def cysteine_pair_lists(draw, residues):
    """
    Given an iterable of residues, returns a list of tuples of cysteine pairs

    No cysteine appears in more than one of the returned pairs.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :param residues: An iterable of residues
    :type residues: iterable

    :rtype: list(tuple)
    :return: A list of tuples of cysteine pairs
    """
    cysteines = [
        res for res in residues if res.is_res and res.type.name == "Cysteine"
    ]

    # Draw how many pairs to make: anywhere from none up to the largest number
    # of disjoint pairs the available cysteines allow.
    n_ss_bonds = draw(
        strategies.integers(min_value=0, max_value=len(cysteines) // 2))

    # Shuffle so that the pairing itself is randomized.
    draw(strategies.randoms()).shuffle(cysteines)
    # Popping guarantees each cysteine is used at most once.
    return [(cysteines.pop(), cysteines.pop()) for _ in range(n_ss_bonds)]


@strategies.composite
def generated_sequences(draw,
                        min_size=0,
                        max_size=None,
                        residue_types=None,
                        include_gaps=True,
                        add_cysteine_bonds=True):
    """
    Draw a protein sequence with randomized residues and metadata.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :param min_size: Minimum length for sequences
    :type  min_size: int

    :param max_size: Maximum number of residues to include in the sequence
    :type  max_size: int

    :param residue_types: Alphabet to use for residues
    :type  residue_types: list(residue.ElementType)

    :param include_gaps: Whether to generate gaps in the sequence
    :type  include_gaps: bool

    :param add_cysteine_bonds: Whether to add disulfide bonds between drawn
        pairs of cysteines in the sequence
    :type  add_cysteine_bonds: bool

    :return: A protein sequence suitable for testing
    :rtype: schrodinger.protein.sequence.ProteinSequence
    """
    elements = draw(
        strategies.lists(elements=generated_residues(
            residue_types=residue_types, gaps=include_gaps),
                         min_size=min_size,
                         max_size=max_size))
    name = draw(strategies.text(alphabet=_LETTERS_AND_NUMBERS))
    entry_id = str(draw(strategies.integers(min_value=1, max_value=1000)))
    chain = draw(strategies.text(alphabet=string.ascii_letters, max_size=1))
    seq = sequence.ProteinSequence(elements,
                                   name=name,
                                   entry_id=entry_id,
                                   chain=chain,
                                   structure_chain=chain)

    # Flag the elements at the drawn indices as seqres-only.
    structureless = draw(indices_strats.index_lists(seq))
    for idx in structureless:
        seq[idx].seqres_only = True

    if add_cysteine_bonds:
        cysteines = draw(cysteine_pair_lists(seq))
        for res1, res2 in cysteines:
            # Each bond is randomly marked as known or not.
            known = draw(strategies.booleans())
            residue.add_disulfide_bond(res1, res2, known=known)

    return seq


@strategies.composite
def generated_multichain_sequences(draw,
                                   min_chain_size=0,
                                   max_chain_size=None,
                                   min_num_chains=1,
                                   max_num_chains=None,
                                   residue_types=None,
                                   include_gaps=True):
    """
    Generates multiple sequences that represent different chains of a single
    protein.

    All generated sequences share the same name and entry id; each one gets
    its own unique single-character chain name.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :param min_chain_size: The minimum length of each chain's sequence
    :type  min_chain_size: int

    :param max_chain_size: The maximum length of each chain's sequence
    :type  max_chain_size: int

    :param min_num_chains: The minimum number of chains in the protein.  Must be
        positive.
    :type  min_num_chains: int

    :param max_num_chains: The maximum number of chains in the protein.  Must be
        less than or equal to 62 since each chain needs a unique
        single-character name.  Defaults to 62.
    :type  max_num_chains: int

    :param residue_types: Alphabet to use for residues
    :type  residue_types: list(residue.ElementType)

    :param include_gaps: Whether to generate gaps in the sequence
    :type  include_gaps: bool

    :return: The generated sequences
    :rtype: list[schrodinger.protein.sequence.ProteinSequence]
    """
    name = draw(strategies.text(alphabet=_LETTERS_AND_NUMBERS))
    entry_id = str(draw(strategies.integers(min_value=1, max_value=1000)))
    assert min_num_chains >= 1, "min_num_chains must be positive"
    if max_num_chains is None:
        max_num_chains = len(_LETTERS_AND_NUMBERS)
    else:
        assert max_num_chains <= len(_LETTERS_AND_NUMBERS), (
            "max_num_chains cannot exceed the number of available chain names")
    # unique=True guarantees that no two chains share a name.
    chain_names = draw(
        strategies.lists(elements=strategies.sampled_from(_LETTERS_AND_NUMBERS),
                         min_size=min_num_chains,
                         max_size=max_num_chains,
                         unique=True))
    seqs = []
    for cur_chain_name in chain_names:
        elements = draw(
            strategies.lists(elements=generated_residues(
                residue_types=residue_types, gaps=include_gaps),
                             min_size=min_chain_size,
                             max_size=max_chain_size))
        cur_seq = sequence.ProteinSequence(elements,
                                           name=name,
                                           entry_id=entry_id,
                                           chain=cur_chain_name)
        # Flag the elements at the drawn indices as seqres-only.
        for idx in draw(indices_strats.index_lists(cur_seq)):
            cur_seq[idx].seqres_only = True
        seqs.append(cur_seq)
    return seqs


@strategies.composite
def alignment_infos(draw,
                    include_interseq_ss_bonds=True,
                    include_anchor_residues=False,
                    **kwargs):
    """
    Returns everything we need to create an alignment. Takes in `kwargs` to
    pass to `strategies.lists()`.

    We return an alignment_info instead of an alignment in order to allow tests
    to create different kinds of alignments and also to pass along additional
    test data customized to the alignment that the test will create.

    :param draw: A function supplied by hypothesis
    :type  draw: function

    :param include_interseq_ss_bonds: Whether to draw pairs of cysteines,
        taken from all of the sequences together, for the test to bond
    :type  include_interseq_ss_bonds: bool

    :param include_anchor_residues: Whether to draw anchor residues. The
        candidates are the non-gap residues of every sequence after the first
        that line up with a non-gap residue of the first sequence.
    :type  include_anchor_residues: bool

    :return: A test fixture for alignment tests
    :rtype: AlignmentInfo
    """

    seqs = draw(strategies.lists(simple_sequences, **kwargs))

    anchor_residues = []
    if include_anchor_residues and seqs:
        # The first sequence acts as the reference.
        ref_seq = seqs[0]
        anchorable_residues = [
            res for seq in seqs[1:] for res, ref_res in zip(seq, ref_seq)
            if not res.is_gap and not ref_res.is_gap
        ]
        if anchorable_residues:
            anchor_residues = draw(indices_strats.sublists(anchorable_residues))

    cysteines_to_bond = []
    if include_interseq_ss_bonds:
        all_residues = itertools.chain.from_iterable(seqs)
        cysteines_to_bond = draw(cysteine_pair_lists(all_residues))

    return AlignmentInfo(seqs=seqs,
                         cysteines_to_bond=cysteines_to_bond,
                         anchor_residues=anchor_residues)