"""
This module represents the model of all intermediate stages of matches acquired via the KEGG SSDB database, partly in conjunction with KEGG GENE.
The methods to actually perform the retrieval are not part of this module. See :mod:`FEV_KEGG.KEGG.Database` and :mod:`FEV_KEGG.KEGG.Download` for these.
"""
import jsonpickle
from typing import Iterable, List
from FEV_KEGG.Graph.Elements import GeneID
from FEV_KEGG.Statistics import SequenceComparison
class PreMatch(object):

    def __init__(self, foundGeneIdString: str, swScore: int, bitScore: float, identity: float, overlap: int):
        """
        A sequence comparison match between two distinct genes, without calculated attributes.

        The parameters can be retrieved via `KEGG SSDB <http://www.kegg.jp/ssdb-bin/ssdb_ortholog_view?org_gene=syn:sll1450&org=syn>`_ [1]_.
        All values are stored as passed, no validation or conversion is performed here.

        Parameters
        ----------
        foundGeneIdString : str
            ID of the gene found by SSDB to be a paralog/ortholog, e.g. syn:sll1452.
        swScore : int
            Smith-Waterman score of the match between the gene which was searched for and the found gene specified by `foundGeneIdString`.
        bitScore : float
            Length-normalised `swScore` scaled to bits.
        identity : float
            Percentage of equal amino acids, without substitution.
        overlap : int
            Number of amino acids the found gene sequence overlaps with the gene which was searched for. Maximum is the length of the searched-for gene.

        Attributes
        ----------
        self.foundGeneIdString : str
        self.swScore : int
        self.bitScore : float
        self.identity : float
        self.overlap : int

        See Also
        --------
        FEV_KEGG.KEGG.Database.getOrthologs : Function to retrieve PreMatches from KEGG SSDB.

        References
        ----------
        .. [1] Sato et al. (2001), "SSDB: Sequence Similarity Database in KEGG", `<https://www.researchgate.net/publication/254718427_SSDB_Sequence_Similarity_Database_in_KEGG>`_
        """
        self.foundGeneIdString = foundGeneIdString
        self.swScore = swScore
        self.bitScore = bitScore
        self.identity = identity
        self.overlap = overlap
class Match(PreMatch):

    def __init__(self, foundGeneIdString, swScore, bitScore, identity, overlap, length):
        """
        A sequence comparison match between two distinct genes, including calculated attributes.

        During creation, `foundGeneIdString` is used to create a GeneID object, saved as `foundGeneID`.

        Parameters
        ----------
        foundGeneIdString : str
            ID of the gene found by SSDB to be a paralog/ortholog, e.g. syn:sll1452.
        swScore : int
            Smith-Waterman score of the match between the gene which was searched for and the found gene specified by `foundGeneIdString`.
        bitScore : float
            Length-normalised `swScore` scaled to bits.
        identity : float
            Percentage of equal amino acids, without substitution.
        overlap : int
            Number of amino acids the found gene sequence overlaps with the gene which was searched for. Maximum is the length of the searched-for gene.
        length : int
            Length in amino acids of the found gene. Derived from downloading the gene's information file.

        Attributes
        ----------
        self.foundGeneIdString : str
        self.swScore : int
        self.bitScore : float
        self.identity : float
        self.overlap : int
        self.length : int
        self.foundGeneID : :class:`FEV_KEGG.Graph.Elements.GeneID`

        Raises
        ------
        ValueError
            If `length` is zero or negative.

        See Also
        --------
        PreMatch : Handles all other parameters.
        """
        super().__init__(foundGeneIdString, swScore, bitScore, identity, overlap)
        if length <= 0:
            raise ValueError("Matched sequence can not be of length <= 0.")
        self.length = length
        self.foundGeneID = GeneID(foundGeneIdString)

    @classmethod
    def fromPreMatch(cls, preMatch: PreMatch, length):
        """
        Cast a PreMatch object to an object of this class.

        During casting, `foundGeneIdString` is used to create a GeneID object, stored as `foundGeneID`. Also, `length` is stored.

        Parameters
        ----------
        preMatch : PreMatch
            The object to cast into this class' type. It is modified in place.
        length : int
            Length in amino acids of the found gene. Derived from downloading the gene's information file.

        Returns
        -------
        Match
            The very same object as `preMatch`, now of this class' type.

        Raises
        ------
        ValueError
            If `length` is zero or negative. In this case `preMatch` is left completely untouched.

        Note
        ----
        This class method simply casts the PreMatch object, instead of going through creating a new Match object.
        This helps performance and does not significantly impact complexity.
        """
        # Do everything that can fail *before* touching `preMatch`. Otherwise a rejected
        # length would leave the caller with an object that already claims to be a Match,
        # but lacks `length` and `foundGeneID`.
        if length <= 0:
            raise ValueError("Matched sequence can not be of length <= 0.")
        foundGeneID = GeneID(preMatch.foundGeneIdString)

        preMatch.__class__ = cls
        preMatch.length = length
        preMatch.foundGeneID = foundGeneID
        return preMatch
class TransientMatch(Match):

    def __init__(self, foundGeneIdString, swScore, bitScore, identity, overlap, length, eValue):
        """
        A sequence comparison match between two distinct genes, only valid at a certain point in time.

        This match is transient, because it is only valid for a certain point in time, because `eValue` changes with the size of the database.

        Parameters
        ----------
        foundGeneIdString : str
            ID of the gene found by SSDB to be a paralog/ortholog, e.g. syn:sll1452.
        swScore : int
            Smith-Waterman score of the match between the gene which was searched for and the found gene specified by `foundGeneIdString`.
        bitScore : float
            Length-normalised `swScore` scaled to bits.
        identity : float
            Percentage of equal amino acids, without substitution.
        overlap : int
            Number of amino acids the found gene sequence overlaps with the gene which was searched for. Maximum is the length of the searched-for gene.
        length : int
            Length in amino acids of the found gene. Derived from downloading the gene's information file.
        eValue : float
            Statistical expectation value for the chance of yielding a match of the same score by pure randomness alone.

        Attributes
        ----------
        self.foundGeneIdString : str
        self.swScore : int
        self.bitScore : float
        self.identity : float
        self.overlap : int
        self.length : int
        self.foundGeneID : :class:`FEV_KEGG.Graph.Elements.GeneID`
        self.eValue : float

        See Also
        --------
        Match : Handles all other parameters.
        """
        super().__init__(foundGeneIdString, swScore, bitScore, identity, overlap, length)
        self.eValue = eValue

    @classmethod
    def fromMatch(cls, match: Match, eValue):
        """
        Cast a Match object to an object of this class.

        During casting, `eValue` is stored.

        Parameters
        ----------
        match : Match
            The object to cast into this class' type. It is modified in place.
        eValue : float
            Statistical expectation value for the chance of yielding a match of the same score by pure randomness alone.

        Returns
        -------
        TransientMatch
            The very same object as `match`, now of this class' type.

        Note
        ----
        This class method simply casts the Match object, instead of going through creating a new TransientMatch object.
        This helps performance and does not significantly impact complexity.
        """
        # Re-label the existing instance instead of copying it; the caller's reference changes type, too.
        match.__class__ = cls
        match.eValue = eValue
        return match
class JSONpickable(object):

    def __str__(self):
        """
        Encodes object to "unpickable" JSON.

        Returns
        -------
        str
            Object in JSON format, including information to "unpickle" it back into an object.
        """
        # NOTE(review): this sets a module-wide jsonpickle option on every call, and only for the
        # backend named 'simplejson'. Presumably the indentation is not applied when jsonpickle
        # falls back to another JSON backend -- confirm against the installed jsonpickle version.
        jsonpickle.set_encoder_options('simplejson', indent=4)
        return jsonpickle.encode(self)
class Matching(JSONpickable):

    def __init__(self, queryGeneID: GeneID, queryLength, databaseOrganism, databaseSize, matches: Iterable[Match], timestamp):
        """
        Result of a search for orthologs or paralogs in SSDB, concerning a single target organism.

        The E-values for the resulting Matches depend on database size and are therefore only valid at the specified timestamp.
        For every passed match, the E-value is calculated from its bit score, `queryLength`, its own length, and `databaseSize`.

        Parameters
        ----------
        queryGeneID : GeneID
            ID of the gene to search homologs for, e.g. "syn:sll1450".
        queryLength : int
            Length of the gene product in amino acids.
        databaseOrganism : str
            Organism to search in to find homologs for `queryGeneID`, e.g. "eco".
        databaseSize : int
            Number of genes known to belong to the `databaseOrganism`. This can be queried by `<http://rest.kegg.jp/info/eco>`_, currently yielding "4,498 entries".
        matches : Iterable[Match]
            Iterable of Match objects, one for each match found during the matching.
            The objects are cast to :class:`TransientMatch` in place, i.e. the caller's objects change their type, too.
        timestamp : int
            When was the query run? As UNIX epoch timestamp in seconds.

        Attributes
        ----------
        self.queryGeneID : :class:`FEV_KEGG.Graph.Elements.GeneID`
        self.queryLength : int
        self.databaseOrganism : str
        self.databaseSize : int
        self.timestamp : int
        self.matches : List[:class:`TransientMatch`]
        """
        self.queryGeneID = queryGeneID
        self.queryLength = queryLength
        self.databaseOrganism = databaseOrganism
        self.databaseSize = databaseSize
        self.timestamp = timestamp

        # The E-value is computed while the object is still a plain Match, then the object is cast in place.
        self.matches = [
            TransientMatch.fromMatch(
                match,
                SequenceComparison.getExpectationValue(match.bitScore, queryLength, match.length, databaseSize)
            )
            for match in matches
        ]
class MatchingOverview(JSONpickable):

    def __init__(self, queryGeneID: GeneID, queryLength, bestMatches: Iterable[Match], timestamp):
        """
        Result of a search for orthologs in SSDB, concerning all possible target organisms.

        Because all possible organisms are searched, only the best matches are returned. If you want all matches, you will have to use :class:`Matching` in a second step.
        The E-values for the resulting Matches depend on database size and are therefore only valid at the specified timestamp.

        Parameters
        ----------
        queryGeneID : GeneID
            ID of the gene to search homologs for, e.g. "syn:sll1450".
        queryLength : int
            Length of the gene product in amino acids.
        bestMatches : Iterable[Match]
            Iterable of best Match objects, one for each orthologous organism found during the matching overview.
        timestamp : int
            When was the query run? As UNIX epoch timestamp in seconds.

        Attributes
        ----------
        self.queryGeneID : :class:`FEV_KEGG.Graph.Elements.GeneID`
        self.queryLength : int
        self.timestamp : int
        self.bestMatches : Iterable[:class:`Match`]
            Stored exactly as passed. Elements are only turned into :class:`TransientMatch` by :func:`getTransientMatches`.
            If a one-shot iterator was passed, it can only be traversed once.
        """
        self.queryGeneID = queryGeneID
        self.queryLength = queryLength
        self.bestMatches = bestMatches
        self.timestamp = timestamp

    def getTransientMatches(self, relevantOrganisms: Iterable[str]) -> List[TransientMatch]:
        """
        Get full transient matches, considering only relevant orthologous organisms.

        Considering only relevant organisms is necessary, because a gene can have several thousand orthologs, including ones from organisms completely out of scope, while calculating the E-value for each of those matches is rather slow and involves several downloads.

        Parameters
        ----------
        relevantOrganisms : Iterable[str]
            Iterable of organism abbreviations, for each organism to be considered relevant. May be any iterable, including a one-shot iterator.

        Returns
        -------
        List[TransientMatch]
            List of transient matches. These include E-values, which are slow to calculate, which is why only `relevantOrganisms` are considered.
            This means that only matches found in `self.bestMatches` which come from relevant organisms are actually converted to transient matches.
            Converted matches are cast in place, i.e. the corresponding objects in `self.bestMatches` change their type, too.
        """
        # NOTE(review): imported at call time rather than at module level, presumably to avoid a circular import -- confirm.
        from FEV_KEGG.KEGG.Organism import Organism

        # Materialise once: testing `in` against a one-shot iterator would consume it and silently
        # miss later matches, and against a list it would be a linear scan per match.
        relevantAbbreviations = frozenset(relevantOrganisms)

        transientMatches = []
        for match in self.bestMatches:
            organismAbbreviation = match.foundGeneID.organismAbbreviation
            if organismAbbreviation not in relevantAbbreviations:
                continue  # organism is out of scope, spare the expensive E-value calculation

            # the E-value depends on the number of genes of the organism the match was found in
            databaseSize = Organism(organismAbbreviation).getNumberOfGenes()
            eValue = SequenceComparison.getExpectationValue(match.bitScore, self.queryLength, match.length, databaseSize)
            transientMatches.append( TransientMatch.fromMatch(match, eValue) )

        return transientMatches