Source code for schrodinger.protein.gpcr.tasks

import copy
from typing import Tuple

import numpy as np

from schrodinger.application.msv.gui import gui_alignment
from schrodinger.models import parameters
from schrodinger.protein import align
from schrodinger.protein import sequence
from schrodinger.protein.gpcr import annotate
from schrodinger.protein.gpcr import sql
from schrodinger.protein.tasks import blast
from schrodinger.tasks import tasks


[docs]class GPCRTask(tasks.ComboSubprocessTask): """ Task to run 'protein.tasks.blast.BlastTask' task against a custom database. """ output: gui_alignment.GuiProteinAlignment
[docs] class Input(parameters.CompoundParam): query_sequence: sequence.ProteinSequence = None custom_database_path: str sqlite_path: str = None
[docs] def mainFunction(self): blast_task = blast.BlastTask() inp = blast_task.input inp.query_sequence = self.input.query_sequence inp.settings.location = blast.LOCAL inp.settings.custom_database_path = self.input.custom_database_path inp.settings.database_name = blast.BlastDatabase.CUSTOM inp.settings.num_alignments = 100 blast_task.start() blast_task.wait() self.output = self._getBestAlignment(blast_task.output) self.transferrAnnotationToQuerySeq()
def _getBestAlignment(self, hits) -> str: """ Get the alignment that consists of query sequence (as ref seq) and the sequence that has highest identity with the reference sequence. Raises ValueError when there is no sequence with 90% or more identity. """ aligner = align.BiopythonPairwiseAligner(penalize_end_gaps=True) best_identity = 0.9 best_aln = None best_alignment_score = 0.0 for hit in hits: seq = sequence.ProteinSequence(elements=hit['sequence'], name=hit['name']) aln, identity, alignment_score = self._getHitScore(aligner, seq) if identity > best_identity: best_aln = aln elif np.isclose(identity, best_identity): if alignment_score > best_alignment_score: best_aln = aln if best_aln is not None: return best_aln raise ValueError("Could not find sequence with identity > 90% ") def _getHitScore( self, aligner: align.BiopythonPairwiseAligner, seq: sequence.ProteinSequence ) -> Tuple[gui_alignment.GuiProteinAlignment, float, float]: """ Align the given sequence to the reference sequence using the aligner and find out the identity and alignment score. A copy of the reference seq is used for the alignment to avoid preserving gap from the previous alignment. """ ref_seq = copy.deepcopy(self.input.query_sequence) aln = gui_alignment.GuiProteinAlignment([ref_seq, seq]) aligner.run(aln) alignment_score = aligner.getAlignmentScore() identity = ref_seq.getIdentity(seq) return aln, identity, alignment_score
[docs] def transferrAnnotationToQuerySeq(self): """ Transfer the GPCR annotation from the blast hit to the query sequence. """ gpcr_anno = self._getGPCRAnnotations() gpcr_anno_map = self.getGPCRAnnotationMap(gpcr_anno) if gpcr_anno_map: self.input.query_sequence.setGPCRAnnotations(gpcr_anno_map)
def _getGPCRAnnotations(self): """ Get the gpcr annotation of the blast result sequence from the sqlite database. """ out_put_seq = self.output[1] entry_name = out_put_seq.name.strip('_G') if self.input.sqlite_path is None: return [] conn = sql.open_database(filename=self.input.sqlite_path) gpcr_anno = annotate.get_residue_annotations(entry_name, conn=conn) return gpcr_anno
[docs] def getGPCRAnnotationMap(self, gpcr_anno): """ Generate a map of residues in input sequence to the corresponding gpcr annotation that is transferred from the blast hit. """ # Since we make copy of the input.query_sequence before alignment, the # aligned query sequence may have gaps at different indices than in the # input.query_sequence. original_seq = self.input.query_sequence gpcr_annotation_map = dict() query_seq = self.output[0] hit_seq = self.output[1] orginal_sequence_map = { res.gapless_idx_in_seq: res for res in original_seq.residues() } for res in hit_seq.residues(): query_seq_res = query_seq[res.idx_in_seq] if query_seq_res.is_gap: continue orig_res = orginal_sequence_map[query_seq_res.gapless_idx_in_seq] annotation = gpcr_anno[res.gapless_idx_in_seq] gpcr_annotation_map[orig_res] = annotation return gpcr_annotation_map