# Source code for FEV_KEGG.KEGG.SSDB

"""
This module models all intermediate stages of matches acquired via the KEGG SSDB database, partly in conjunction with KEGG GENE.

The methods to actually perform the retrieval are not part of this module. See :mod:`FEV_KEGG.KEGG.Database` and :mod:`FEV_KEGG.KEGG.Download` for these.
"""
import jsonpickle
from typing import Iterable, List

from FEV_KEGG.Graph.Elements import GeneID
from FEV_KEGG.Statistics import SequenceComparison


class PreMatch(object):

    def __init__(self, foundGeneIdString, swScore, bitScore, identity, overlap):
        """
        Raw sequence comparison match between two distinct genes, holding only the values reported by SSDB.

        No attribute is calculated here; every parameter is stored unchanged under the same name.
        The values can be looked up via `KEGG SSDB <http://www.kegg.jp/ssdb-bin/ssdb_ortholog_view?org_gene=syn:sll1450&org=syn>`_ [1]_.

        Parameters
        ----------
        foundGeneIdString : str
            ID of the gene SSDB reports as paralog/ortholog, e.g. syn:sll1452.
        swScore : int
            Smith-Waterman score of the match between the searched-for gene and the gene named by `foundGeneIdString`.
        bitScore : float
            Length-normalised `swScore`, scaled to bits.
        identity : float
            Percentage of equal amino acids, without substitution.
        overlap : int
            Number of amino acids by which the found gene sequence overlaps the searched-for gene. At most the length of the searched-for gene.

        Attributes
        ----------
        self.foundGeneIdString : str
        self.swScore : int
        self.bitScore : float
        self.identity : float
        self.overlap : int

        See Also
        --------
        FEV_KEGG.KEGG.Database.getOrthologs : Function to retrieve PreMatches from KEGG SSDB.

        References
        ----------
        .. [1] Sato et al. (2001), "SSDB: Sequence Similarity Database in KEGG", `<https://www.researchgate.net/publication/254718427_SSDB_Sequence_Similarity_Database_in_KEGG>`_
        """
        # Which gene was found.
        self.foundGeneIdString = foundGeneIdString

        # How well it matched, exactly as reported by SSDB.
        self.swScore = swScore
        self.bitScore = bitScore
        self.identity = identity
        self.overlap = overlap
class Match(PreMatch):

    def __init__(self, foundGeneIdString, swScore, bitScore, identity, overlap, length):
        """
        A sequence comparison match between two distinct genes, including calculated attributes.

        During creation, `foundGeneIdString` is used to create a GeneID object, saved as `foundGeneID`.

        Parameters
        ----------
        foundGeneIdString : str
            ID of the gene found by SSDB to be a paralog/ortholog, e.g. syn:sll1452.
        swScore : int
            Smith-Waterman score of the match between the gene which was searched for and the found gene specified by `foundGeneIdString`.
        bitScore : float
            Length-normalised `swScore` scaled to bits.
        identity : float
            Percentage of equal amino acids, without substitution.
        overlap : int
            Number of amino acids the found gene sequence overlaps with the gene which was searched for. Maximum is the length of the searched-for gene.
        length : int
            Length in amino acids of the found gene. Derived from downloading the gene's information file.

        Attributes
        ----------
        self.foundGeneIdString : str
        self.swScore : int
        self.bitScore : float
        self.identity : float
        self.overlap : int
        self.length : int
        self.foundGeneID : :class:`FEV_KEGG.Graph.Elements.GeneID`

        Raises
        ------
        ValueError
            If `length` is zero or negative.

        See Also
        --------
        PreMatch : Handles all other parameters.
        """
        # Reject an impossible length before any attribute is set.
        if length <= 0:
            raise ValueError("Matched sequence can not be of length <= 0.")

        super().__init__(foundGeneIdString, swScore, bitScore, identity, overlap)

        self.length = length
        self.foundGeneID = GeneID(foundGeneIdString)

    @classmethod
    def fromPreMatch(cls, preMatch: PreMatch, length):
        """
        Cast a PreMatch object to an object of this class.

        During casting, `foundGeneIdString` is used to create a GeneID object, stored as `foundGeneID`. Also, `length` is stored.

        Parameters
        ----------
        preMatch : PreMatch
            The object to cast into this class' type. It is modified in place and is also the object returned.
        length : int
            Length in amino acids of the found gene. Derived from downloading the gene's information file.

        Returns
        -------
        Match
            The very same object as `preMatch`, now of type `cls`.

        Raises
        ------
        ValueError
            If `length` is zero or negative. In this case `preMatch` is left completely untouched.

        Note
        ----
        This class method simply casts the PreMatch object, instead of going through creating a new Match object.
        This helps performance and does not significantly impact complexity.
        """
        # Validate and build everything that can fail BEFORE mutating the caller's object.
        # Otherwise a failure would leave `preMatch` re-classed as a Match that lacks
        # `length` and `foundGeneID`.
        if length <= 0:
            raise ValueError("Matched sequence can not be of length <= 0.")

        foundGeneID = GeneID(preMatch.foundGeneIdString)

        preMatch.__class__ = cls
        preMatch.length = length
        preMatch.foundGeneID = foundGeneID
        return preMatch
class TransientMatch(Match):

    def __init__(self, foundGeneIdString, swScore, bitScore, identity, overlap, length, eValue):
        """
        A sequence comparison match between two distinct genes which carries an E-value and is thus bound to a point in time.

        The match is called transient because `eValue` changes with the size of the database, so it is only valid for the moment it was computed.

        Parameters
        ----------
        foundGeneIdString : str
            ID of the gene found by SSDB to be a paralog/ortholog, e.g. syn:sll1452.
        swScore : int
            Smith-Waterman score of the match between the searched-for gene and the gene named by `foundGeneIdString`.
        bitScore : float
            Length-normalised `swScore` scaled to bits.
        identity : float
            Percentage of equal amino acids, without substitution.
        overlap : int
            Number of amino acids by which the found gene sequence overlaps the searched-for gene. At most the length of the searched-for gene.
        length : int
            Length in amino acids of the found gene. Derived from downloading the gene's information file.
        eValue : float
            Statistical expectation value for the chance of yielding a match of the same score by pure randomness alone.

        Attributes
        ----------
        self.foundGeneIdString : str
        self.swScore : int
        self.bitScore : float
        self.identity : float
        self.overlap : int
        self.length : int
        self.foundGeneID : :class:`FEV_KEGG.Graph.Elements.GeneID`
        self.eValue : float

        See Also
        --------
        Match : Handles all other parameters.
        """
        # Everything except the E-value is handled by the parent class.
        super().__init__(
            foundGeneIdString,
            swScore,
            bitScore,
            identity,
            overlap,
            length,
        )
        self.eValue = eValue

    @classmethod
    def fromMatch(cls, match: Match, eValue):
        """
        Cast a Match object to an object of this class.

        During casting, `eValue` is stored on the object.

        Parameters
        ----------
        match : Match
            The object to cast. It is modified in place and is also the object returned.
        eValue : float
            Expectation value to attach to the match.

        Returns
        -------
        TransientMatch
            The very same object as `match`, now of type `cls`.

        Note
        ----
        This class method simply casts the Match object, instead of going through creating a new TransientMatch object.
        This helps performance and does not significantly impact complexity.
        """
        transient = match  # same object; no copy is made
        transient.__class__ = cls
        transient.eValue = eValue
        return transient
class JSONpickable(object):

    def __str__(self):
        """
        Render this object as JSON using :mod:`jsonpickle`.

        Returns
        -------
        str
            Object in JSON format, as produced by ``jsonpickle.encode``, including information to "unpickle" it back into an object.

        Note
        ----
        The encoder option is set through ``jsonpickle.set_encoder_options`` on every call, i.e. at module level of jsonpickle, not per object.
        """
        # Request indented output from the 'simplejson' backend before encoding.
        jsonpickle.set_encoder_options('simplejson', indent=4)
        encoded = jsonpickle.encode(self)
        return encoded
class Matching(JSONpickable):

    def __init__(self, queryGeneID: GeneID, queryLength, databaseOrganism, databaseSize, matches: Iterable[Match], timestamp):
        """
        Result of a search for orthologs or paralogs in SSDB, concerning a single target organism.

        The E-values of the resulting matches depend on the database size and are therefore only valid at the specified timestamp.

        Parameters
        ----------
        queryGeneID : GeneID
            ID of the gene to search homologs for, e.g. "syn:sll1450".
        queryLength : int
            Length of the gene product in amino acids.
        databaseOrganism : str
            Organism to search in to find homologs for `queryGeneID`, e.g. "eco".
        databaseSize : int
            Number of genes known to belong to the `databaseOrganism`. This can be queried by `<http://rest.kegg.jp/info/eco>`_, currently yielding "4,498 entries".
        matches : Iterable[Match]
            Iterable of Match objects, one for each match found during the matching. Each object is cast in place to :class:`TransientMatch`.
        timestamp : int
            When was the query run? As UNIX epoch timestamp in seconds.

        Attributes
        ----------
        self.queryGeneID : :class:`FEV_KEGG.Graph.Elements.GeneID`
        self.queryLength : int
        self.databaseOrganism : str
        self.databaseSize : int
        self.timestamp : int
        self.matches : List[:class:`TransientMatch`]
        """
        self.queryGeneID = queryGeneID
        self.queryLength = queryLength
        self.databaseOrganism = databaseOrganism
        self.databaseSize = databaseSize
        self.timestamp = timestamp

        # Attach an E-value to every match; the E-value is computed from the match's
        # bit score, both sequence lengths, and the size of the searched database.
        self.matches = [
            TransientMatch.fromMatch(
                foundMatch,
                SequenceComparison.getExpectationValue(foundMatch.bitScore, queryLength, foundMatch.length, databaseSize),
            )
            for foundMatch in matches
        ]
class MatchingOverview(JSONpickable):

    def __init__(self, queryGeneID: GeneID, queryLength, bestMatches: Iterable[Match], timestamp):
        """
        Result of a search for orthologs in SSDB, concerning all possible target organisms.

        Because all possible organisms are searched, only the best matches are returned. If you want all matches, you will have to use :class:`Matching` in a second step.
        The E-values for the resulting Matches depend on database size and are therefore only valid at the specified timestamp.

        Parameters
        ----------
        queryGeneID : GeneID
            ID of the gene to search homologs for, e.g. "syn:sll1450".
        queryLength : int
            Length of the gene product in amino acids.
        bestMatches : Iterable[Match]
            Iterable of best Match objects, one for each orthologous organism found during the matching overview.
            Stored as passed, without copying.
        timestamp : int
            When was the query run? As UNIX epoch timestamp in seconds.

        Attributes
        ----------
        self.queryGeneID : :class:`FEV_KEGG.Graph.Elements.GeneID`
        self.queryLength : int
        self.bestMatches : Iterable[:class:`Match`]
            Elements are cast in place to :class:`TransientMatch` by :meth:`getTransientMatches` once they are found relevant.
        self.timestamp : int
        """
        self.queryGeneID = queryGeneID
        self.queryLength = queryLength
        self.bestMatches = bestMatches
        self.timestamp = timestamp

    def getTransientMatches(self, relevantOrganisms: Iterable[str]) -> List[TransientMatch]:
        """
        Get full transient matches, considering only relevant orthologous organisms.

        Considering only relevant organisms is necessary, because a gene can have several thousand orthologs, including ones from organisms completely out of scope,
        while calculating the E-value for each of those matches requires looking up the organism's number of genes, which is rather slow.

        Parameters
        ----------
        relevantOrganisms : Iterable[str]
            Iterable of organism abbreviations, for each organism to be considered relevant.
            May be a one-shot iterator or generator; it is read exactly once.

        Returns
        -------
        List[TransientMatch]
            List of transient matches. These include E-values, which are slow to calculate, which is why only `relevantOrganisms` are considered.
            This means that only matches found in `self.bestMatches` which come from relevant organisms are actually converted to transient matches.
            The converted matches are the same objects as in `self.bestMatches`, cast in place.
        """
        # Imported here rather than at module level, as in the original code
        # (presumably to avoid a circular import -- TODO confirm).
        from FEV_KEGG.KEGG.Organism import Organism

        # Materialise the relevant abbreviations once. Testing `in` directly against an
        # arbitrary iterable would exhaust a generator after the first few checks and
        # cost O(n) per match on a list.
        # A bare string is deliberately left as-is, so its previous (substring)
        # membership behaviour stays unchanged.
        if isinstance(relevantOrganisms, str):
            relevant = relevantOrganisms
        else:
            relevant = frozenset(relevantOrganisms)

        transientMatches = []

        for match in self.bestMatches:
            organismAbbreviation = match.foundGeneID.organismAbbreviation

            if organismAbbreviation not in relevant:
                continue

            # The E-value depends on the size of the found gene's organism database.
            databaseSize = Organism(organismAbbreviation).getNumberOfGenes()
            eValue = SequenceComparison.getExpectationValue(match.bitScore, self.queryLength, match.length, databaseSize)

            transientMatches.append(TransientMatch.fromMatch(match, eValue))

        return transientMatches