Source code for schrodinger.protein.tasks.muscle

# Runs MUSCLE alignment jobs
# This module provides an interface similar to clustal.py's.
# They might be merged in the future to reduce code duplication

import copy
import glob
import os
import tempfile

from schrodinger.application.msv import seqio
from schrodinger.protein.tasks import clustal
from schrodinger.utils import fileutils


[docs]def get_muscle_path(): """ Returns the path to the MUSCLE executable. :rtype: str :return: path to MUSCLE executable file, or None if the executable could not be located. """ pattern = os.path.join(fileutils.get_mmshare_dir(), "bin", "*", "muscle*") results = glob.glob(pattern) if results: return results[0] print("No MUSCLE exec found!") return None
[docs]class MuscleJob(clustal.AbstractAlignmentJob): """ Runs alignment via MUSCLE program :note: Muscle by default rearranges the order to put similar sequences together """
[docs] def __init__(self, aln, second_aln=None, quiet=True, maintain_order=True): """ :param aln: Input sequences :type aln: ProteinAlignment :param second_aln: Second alignment for aligning multiple MSAs :type second_aln: ProteinAlignment :param maintain_order: Whether to maintain input order of sequences :type maintain_order: bool Dev note: - Unimplemented features: -- Speedup options """ super().__init__(aln, second_aln) self.quiet = quiet self.maintain_order = maintain_order
[docs] def run(self): """ Run the MUSCLE alignment. :return: Aligned sequences :rtype: ProteinAlignment """ muscle_exe = get_muscle_path() if not muscle_exe: raise RuntimeError("Could not locate MUSCLE executable!") to_write = self.aln if self.second_aln: # Pad alignments to prevent muscle from adding trailing X self.aln.padAlignment() self.second_aln.padAlignment() else: # When only one input alignment is passed to Muscle, it ignores # gap-only sequences and excludes them from the output. To work # around this, we remove these sequences and manually add them back # to the output alignment. empty_seq_indices = [ i for i, seq in enumerate(self.aln) if not seq.getGaplessLength() ] if empty_seq_indices: to_write = copy.deepcopy(self.aln) empty_seqs = [to_write[i] for i in empty_seq_indices] to_write.removeSeqs(empty_seqs) tmpdir_obj = tempfile.TemporaryDirectory() with tmpdir_obj as tmpdir: inp_fname = os.path.join(tmpdir, 'input.fasta') inp_fname2 = None out_fname = os.path.join(tmpdir, 'output.fasta') if self.second_aln: inp_fname2 = os.path.join(tmpdir, 'input2.fasta') seqio.FastaAlignmentWriter.write(self.second_aln, inp_fname2) in_names = seqio.FastaAlignmentWriter.write(to_write, inp_fname) command = self._makeCommand(muscle_exe=muscle_exe, inp_fname=inp_fname, out_fname=out_fname, inp_fname2=inp_fname2) # execute MUSCLE self._run(command) if self.canceled: return None # read the aligned sequences new_alignment = seqio.FastaAlignmentReader.read(out_fname) try: tmpdir_obj.cleanup() except OSError: # Debugging for MSV-3768 print(os.listdir(tmpdir)) if not self.second_aln and self.maintain_order: # Reorder as input # profile mode does not change the sequence order in each input MSA # Note MUSCLE has a built in -stable option which is buggy (and # disabled). See http://www.drive5.com/muscle/manual/stable.html seqio.reorder_fasta_alignment(new_alignment, in_names) if not self.second_aln and empty_seq_indices: # add gap-only sequences back in for i, seq in zip(empty_seq_indices, empty_seqs): # muscle pads sequences with trailing gaps, so we do the same # for the gap-only sequences seq.removeElements(seq) seq.extend(seq.gap_char * new_alignment.num_columns) new_alignment.addSeq(seq, i) return new_alignment
def _makeCommand(self, muscle_exe, inp_fname, out_fname, inp_fname2=None): command = [muscle_exe, '-out', out_fname] if inp_fname2 is None: command.extend(['-in', inp_fname]) else: # If aligning alignments command.extend(['-profile', '-in1', inp_fname, '-in2', inp_fname2]) if self.gapopen: command.extend(['-gapopen', str(-self.gapopen)]) if self.gapext: command.extend(['-gapextend', str(-self.gapext)]) if self.quiet: command.extend(['-quiet']) return command