Source code for schrodinger.test.hypothesis.strategies.proteins

import itertools
import string

from hypothesis import strategies

from schrodinger import structure
from schrodinger.protein import alignment
from schrodinger.protein import annotation
from schrodinger.protein import predictors
from schrodinger.protein import residue
from schrodinger.protein import sequence
from schrodinger.test.hypothesis.strategies import indices as indices_strats

# Characters used for generated sequence names and single-character chain
# names.
_LETTERS_AND_NUMBERS = list(string.ascii_letters + string.digits)
_PROT_ANNOTATIONS = list(
    annotation.ProteinAlignmentAnnotations.ANNOTATION_TYPES)
# Every combination of alignment annotation types of size 0 .. N-1.
# NOTE(review): `range(N)` stops before N, so the one combination containing
# all N annotation types is never produced - confirm that this is intended.
_PROTEIN_ANN_COMBINATIONS = list(
    itertools.chain.from_iterable(
        itertools.combinations(_PROT_ANNOTATIONS, size)
        for size in range(len(_PROT_ANNOTATIONS))))
# Hypothesis samples from this list, and replaying a saved example depends on
# the list having the same order on every run.  Filter the alphabet in its own
# iteration order (dropping duplicates) instead of converting a set to a list,
# because set iteration order is not guaranteed to be stable between
# interpreter runs.
_STD_AMINO_ACID_SET = frozenset(residue.STD_AMINO_ACIDS)
_NON_STD_AMINO_ACIDS = [
    res_type
    for res_type in dict.fromkeys(residue.get_protein_alphabet().values())
    if res_type not in _STD_AMINO_ACID_SET
]

# Hypothesis needs ordered collections to sample from and shrink, and replaying
# a saved example depends on that order being identical on every run.  The
# lists are therefore built in the iteration order of the annotation
# definitions (duplicates dropped) rather than by converting a set to a list,
# because set iteration order is not guaranteed to be stable between
# interpreter runs.
RES_PROP_ANNOS = list(
    dict.fromkeys(
        annotation.ProteinSequenceAnnotations.RES_PROPENSITY_ANNOTATIONS))
_RES_PROP_ANNO_SET = frozenset(RES_PROP_ANNOS)
# Sequence annotations exclude the residue propensity annotations, which are
# exposed separately through RES_PROP_ANNOS.
SEQ_ANNO_TYPES = [
    anno_type for anno_type in dict.fromkeys(
        annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES)
    if anno_type not in _RES_PROP_ANNO_SET
]
ALN_ANNO_TYPES = list(
    dict.fromkeys(annotation.ProteinAlignmentAnnotations.ANNOTATION_TYPES))

# These annotation types can't be enabled through the gui currently.
# `list.remove` is used on purpose: it raises ValueError if one of these types
# is no longer part of the sequence annotations, so this list cannot silently
# go stale.
SEQ_ANNO_TYPES.remove(
    annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.rescode)
SEQ_ANNO_TYPES.remove(
    annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.isoelectric_point)
SEQ_ANNO_TYPES.remove(
    annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.hydrophobicity)
SEQ_ANNO_TYPES.remove(
    annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.sasa)
# Domain annotation fetches from a remote server
SEQ_ANNO_TYPES.remove(
    annotation.ProteinSequenceAnnotations.ANNOTATION_TYPES.domains)


@strategies.composite
def _unique_lists_made_of(draw, elements):
    """
    Draw a list without repeated entries whose items come from `elements`.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :param elements: The values to pick from. Must not be a set.
    :type elements: list

    :return: A list of distinct values taken from `elements`
    :rtype: list
    """
    assert not isinstance(elements, set)
    element_strategy = strategies.sampled_from(elements)
    unique_lists = strategies.lists(element_strategy, unique=True)
    return draw(unique_lists)


@strategies.composite
def generated_aln_annotations(draw):
    """
    Draw a list of distinct annotation types taken from `ALN_ANNO_TYPES`.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :rtype: list
    """
    aln_annotation_lists = _unique_lists_made_of(ALN_ANNO_TYPES)
    return draw(aln_annotation_lists)
@strategies.composite
def generated_seq_annotations(draw):
    """
    Draw a list of distinct annotation types taken from `SEQ_ANNO_TYPES`.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :rtype: list
    """
    seq_annotation_lists = _unique_lists_made_of(SEQ_ANNO_TYPES)
    return draw(seq_annotation_lists)
@strategies.composite
def generated_res_prop_annotations(draw):
    """
    Draw a list of distinct annotation types taken from `RES_PROP_ANNOS`.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :rtype: list
    """
    res_prop_annotation_lists = _unique_lists_made_of(RES_PROP_ANNOS)
    return draw(res_prop_annotation_lists)
@strategies.composite
def generated_annotation_lists(draw):
    """
    Draw one of the precomputed combinations of alignment annotations.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :return: One entry of `_PROTEIN_ANN_COMBINATIONS`, i.e. a tuple of
        annotation types as produced by `itertools.combinations`
    :rtype: tuple
    """
    combination_strategy = strategies.sampled_from(_PROTEIN_ANN_COMBINATIONS)
    return draw(combination_strategy)
class AlignmentInfo:
    """
    Bundle of the data an alignment test needs to build its alignment.
    """

    def __init__(self, seqs=(), cysteines_to_bond=(), anchor_residues=()):
        """
        An object that can be used in alignment tests

        This object can be used to create an alignment, along with fixtures
        adapted to the alignment that can be used in testing it.

        We use this object so that different tests can create different kinds
        of alignments (regular kinds and undoable alignments).

        :param seqs: List of sequences to be used in constructing an
            alignment. Any falsy value (including the default empty tuple) is
            stored as a new empty list.
        :type seqs: list

        :param cysteines_to_bond: Pairs of cysteine residues; stored
            unchanged for the test to use.
        :type cysteines_to_bond: list(tuple)

        :param anchor_residues: Residues to use as anchors; stored unchanged
            for the test to use.
        :type anchor_residues: list
        """
        self.seqs = seqs if seqs else []
        self.cysteines_to_bond = cysteines_to_bond
        self.anchor_residues = anchor_residues

    def __repr__(self):
        """
        Not a strict repr, but rather a comprehensive summary of the
        alignment info object

        This is useful for hypothesis error messages. The Alignment repr can
        be copied and pasted into a repl for interactive debugging.
        """
        # The summary is rendered from a fresh alignment built from the
        # stored sequences.
        aln = alignment.ProteinAlignment(self.seqs)
        summary_lines = (
            "\n",
            "AlignmentInfo object used to construct:",
            str(aln),
            "and containing the additional information:",
            "\n",
            "\nA repl-ready representation of the alignment: \n\n",
            repr(aln),
        )
        return "\n".join(summary_lines)
# Secondary structure values that generated residues may be assigned, both as
# their actual and as their predicted secondary structure.
POSSIBLE_SECONDARY_STRUCTURES = (
    structure.SS_NONE,
    structure.SS_LOOP,
    structure.SS_HELIX,
    structure.SS_STRAND,
    structure.SS_TURN,
)
@strategies.composite
def generated_residues(draw, residue_types=None, gaps=True):
    """
    Draw a single residue (or gap) with randomized per-residue properties.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :param residue_types: Alphabet to use for residues; a `None` entry
        produces a gap. By default one of 21 equally likely buckets is drawn
        first: one bucket gives a gap (only when `gaps` is true), one gives
        the nonstandard amino acids and all remaining buckets give the
        standard amino acids, so roughly 95% of the residues are standard.
    :type residue_types: list(residue.ElementType)

    :param gaps: Whether to include gaps in default residue types
    :type gaps: bool

    :return: A residue suitable for testing, or a gap
    :rtype: schrodinger.protein.residue.Residue or
        schrodinger.protein.residue.Gap
    """
    sampled_from = strategies.sampled_from

    if residue_types is None:
        # Weight the default alphabet towards standard amino acids.
        bucket = draw(strategies.integers(min_value=0, max_value=20))
        if bucket == 0 and gaps:
            residue_types = [None]
        elif bucket == 1:
            residue_types = _NON_STD_AMINO_ACIDS
        else:
            residue_types = residue.STD_AMINO_ACIDS

    chosen_type = draw(sampled_from(residue_types))
    if chosen_type is None:
        return residue.Gap()

    new_res = residue.Residue(chosen_type)
    new_res.temperature_factor = draw(
        strategies.floats(min_value=0, max_value=99))

    ss_values = sampled_from(POSSIBLE_SECONDARY_STRUCTURES)
    new_res.secondary_structure = draw(ss_values)
    new_res.pred_secondary_structure = draw(ss_values)

    # Each prediction may also be missing, hence the extra None choice.
    accessibility_choices = list(predictors.SolventAccessibility) + [None]
    new_res.pred_accessibility = draw(sampled_from(accessibility_choices))
    disordered_choices = list(predictors.Disordered) + [None]
    new_res.pred_disordered = draw(sampled_from(disordered_choices))
    domain_arr_choices = list(predictors.DomainArrangement) + [None]
    new_res.pred_domain_arr = draw(sampled_from(domain_arr_choices))

    # Residue numbers can be negative
    new_res.resnum = draw(
        strategies.integers(min_value=-999, max_value=9999))

    # The most common insertion codes are ' ', 'A', and 'B'
    possible_inscodes = ' ' + string.ascii_letters
    new_res.inscode = draw(sampled_from(possible_inscodes))
    return new_res
# For performance reasons, we use `simple_sequences` when generating
# alignments and `generated_sequences` for tests specifically testing
# `sequence` functionality. We hope to fix this in MSV-1537.
simple_sequences = strategies.builds(
    sequence.ProteinSequence,
    elements=strategies.lists(elements=generated_residues()),
    name=strategies.text(alphabet=_LETTERS_AND_NUMBERS),
    entry_id=strategies.text(alphabet=string.digits, min_size=1),
    chain=strategies.text(alphabet=string.ascii_letters, max_size=1),
    long_name=strategies.text(alphabet=_LETTERS_AND_NUMBERS),
)
@strategies.composite
def cysteine_pair_lists(draw, residues):
    """
    Given an iterable of residues, returns a list of tuples of cysteine pairs

    Only elements for which `is_res` is true and whose type is named
    "Cysteine" are considered. Each cysteine appears in at most one pair.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :param residues: An iterable of residues
    :type residues: iterable

    :rtype: list(tuple)
    :return: A list of tuples of cysteine pairs
    """
    available = []
    for res in residues:
        if res.is_res and res.type.name == "Cysteine":
            available.append(res)

    # Anywhere from no pairs up to as many pairs as the cysteines allow
    max_pairs = len(available) // 2
    pair_count = draw(strategies.integers(min_value=0, max_value=max_pairs))

    # Shuffle with a hypothesis-controlled random source so that the pairing
    # is part of the generated example.
    rng = draw(strategies.randoms())
    rng.shuffle(available)

    # Tuple items are evaluated left to right, so each pair takes the last
    # remaining cysteine followed by the one before it.
    return [(available.pop(), available.pop()) for _ in range(pair_count)]
@strategies.composite
def generated_sequences(draw,
                        min_size=0,
                        max_size=None,
                        residue_types=None,
                        include_gaps=True,
                        add_cysteine_bonds=True):
    """
    Draw a protein sequence with a random name, entry id and chain.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :param min_size: Minimum length for sequences
    :type min_size: int

    :param max_size: Maximum number of residues to include in the sequence
    :type max_size: int

    :param residue_types: Alphabet to use for residues
    :type residue_types: list(residue.ElementType)

    :param include_gaps: Whether to generate gaps in the sequence
    :type include_gaps: bool

    :param add_cysteine_bonds: Whether to add disulfide bonds between
        randomly drawn pairs of cysteines in the sequence
    :type add_cysteine_bonds: bool

    :return: A protein sequence suitable for testing
    :rtype: schrodinger.protein.sequence.ProteinSequence
    """
    residue_strategy = generated_residues(residue_types=residue_types,
                                          gaps=include_gaps)
    residue_lists = strategies.lists(elements=residue_strategy,
                                     min_size=min_size,
                                     max_size=max_size)
    elements = draw(residue_lists)
    seq_name = draw(strategies.text(alphabet=_LETTERS_AND_NUMBERS))
    entry_number = draw(strategies.integers(min_value=1, max_value=1000))
    chain_name = draw(
        strategies.text(alphabet=string.ascii_letters, max_size=1))

    # The same chain name is used for the chain and the structure chain.
    seq = sequence.ProteinSequence(elements,
                                   name=seq_name,
                                   entry_id=str(entry_number),
                                   chain=chain_name,
                                   structure_chain=chain_name)

    # Flag a drawn selection of positions as SEQRES-only.
    seqres_only_indices = draw(indices_strats.index_lists(seq))
    for idx in seqres_only_indices:
        seq[idx].seqres_only = True

    if not add_cysteine_bonds:
        return seq

    for cys1, cys2 in draw(cysteine_pair_lists(seq)):
        is_known = draw(strategies.booleans())
        residue.add_disulfide_bond(cys1, cys2, known=is_known)
    return seq
@strategies.composite
def generated_multichain_sequences(draw,
                                   min_chain_size=0,
                                   max_chain_size=None,
                                   min_num_chains=1,
                                   max_num_chains=None,
                                   residue_types=None,
                                   include_gaps=True):
    """
    Generates multiple sequences that represent different chains of a single
    protein.

    All generated sequences share one name and one entry id; each gets its
    own unique single-character chain name.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :param min_chain_size: The minimum length of each chain's sequence
    :type min_chain_size: int

    :param max_chain_size: The maximum length of each chain's sequence
    :type max_chain_size: int

    :param min_num_chains: The minimum number of chains in the protein. Must
        be positive.
    :type min_num_chains: int

    :param max_num_chains: The maximum number of chains in the protein. Must
        be less than or equal to 62 since each chain needs a unique
        single-character name. Defaults to 62.
    :type max_num_chains: int

    :param residue_types: Alphabet to use for residues
    :type residue_types: list(residue.ElementType)

    :param include_gaps: Whether to generate gaps in the sequence
    :type include_gaps: bool

    :return: The generated sequences
    :rtype: list[schrodinger.protein.sequence.ProteinSequence]
    """
    shared_name = draw(strategies.text(alphabet=_LETTERS_AND_NUMBERS))
    shared_entry_id = str(
        draw(strategies.integers(min_value=1, max_value=1000)))

    assert min_num_chains >= 1
    name_pool_size = len(_LETTERS_AND_NUMBERS)
    if max_num_chains is None:
        max_num_chains = name_pool_size
    else:
        assert max_num_chains <= name_pool_size

    chain_name_lists = strategies.lists(
        elements=strategies.sampled_from(_LETTERS_AND_NUMBERS),
        min_size=min_num_chains,
        max_size=max_num_chains,
        unique=True)
    chain_names = draw(chain_name_lists)

    # Every chain draws its residues from the same strategy.
    residue_lists = strategies.lists(elements=generated_residues(
        residue_types=residue_types, gaps=include_gaps),
                                     min_size=min_chain_size,
                                     max_size=max_chain_size)

    chains = []
    for chain_name in chain_names:
        chain_seq = sequence.ProteinSequence(draw(residue_lists),
                                             name=shared_name,
                                             entry_id=shared_entry_id,
                                             chain=chain_name)
        # Flag a drawn selection of positions as SEQRES-only.
        seqres_only_indices = draw(indices_strats.index_lists(chain_seq))
        for idx in seqres_only_indices:
            chain_seq[idx].seqres_only = True
        chains.append(chain_seq)
    return chains
@strategies.composite
def alignment_infos(draw,
                    include_interseq_ss_bonds=True,
                    include_anchor_residues=False,
                    **kwargs):
    """
    Returns everything we need to create an alignment. Takes in `kwargs` to
    pass to `strategies.lists()`.

    We return an alignment_info instead of an alignment in order to allow
    tests to create different kinds of alignments and also to pass along
    additional test data customized to the alignment that the test will
    create.

    :param draw: A function supplied by hypothesis
    :type draw: function

    :param include_interseq_ss_bonds: Whether to draw pairs of cysteines,
        taken from all of the sequences together, for the test to bond
    :type include_interseq_ss_bonds: bool

    :param include_anchor_residues: Whether to draw anchor residues. The
        candidates are the non-gap residues of every sequence after the first
        that sit at the same index as a non-gap element of the first
        sequence.
    :type include_anchor_residues: bool

    :return: A test fixture for alignment tests
    :rtype: AlignmentInfo
    """
    seqs = draw(strategies.lists(simple_sequences, **kwargs))

    anchor_residues = []
    if include_anchor_residues and seqs:
        reference, *other_seqs = seqs
        candidates = [
            res for seq in other_seqs for res, ref_res in zip(seq, reference)
            if not (res.is_gap or ref_res.is_gap)
        ]
        # Only draw when there is something to choose from.
        if candidates:
            anchor_residues = draw(indices_strats.sublists(candidates))

    cysteines_to_bond = []
    if include_interseq_ss_bonds:
        all_residues = itertools.chain.from_iterable(seqs)
        cysteines_to_bond = draw(cysteine_pair_lists(all_residues))

    return AlignmentInfo(seqs=seqs,
                         cysteines_to_bond=cysteines_to_bond,
                         anchor_residues=anchor_residues)