Source code for FEV_KEGG.Evolution.Events

from typing import Set, Dict, Tuple

from FEV_KEGG.Graph.Elements import Enzyme, EcNumber
from FEV_KEGG.Graph.SubstanceGraphs import SubstanceEnzymeGraph, SubstanceEcGraph
from FEV_KEGG.KEGG import Database
from builtins import set
from FEV_KEGG.KEGG.Organism import Group
from _collections_abc import Iterable
from FEV_KEGG import settings
import itertools
from FEV_KEGG.Drawing import Export

defaultEValue = settings.defaultEvalue
"""
Default threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
This can be overridden in each relevant method's `eValue` parameter in this module.
"""


[docs]class GeneFunctionConservation(object): """ Evolutionary event of conserving a gene function (EC number) between a pair of arbitrary ancestor and descendant. The conditions for a gene function conservation are: - The EC number has been conserved, along the way from an older group of organisms to a newer one. """
[docs] @staticmethod def getECs(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> Set[EcNumber]: """ Get EC numbers which have been conserved between ancestor and descendant, existing in both. Parameters ---------- ancestorEcGraph : SubstanceEcGraph descendantEcGraph : SubstanceEcGraph Returns ------- Set[EcNumber] Set of EC numbers which occur in the ancestor's EC graph and in the decendants's, i.e. EC numbers which are conserved in the descendant. """ conservedECs = ancestorEcGraph.getECs() conservedECs.intersection_update(descendantEcGraph.getECs()) return conservedECs
[docs] @staticmethod def getGraph(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> SubstanceEcGraph: """ Get graph containing EC numbers which have been conserved between ancestor and descendant, existing in both. Parameters ---------- ancestorEcGraph : SubstanceEcGraph descendantEcGraph : SubstanceEcGraph Returns ------- SubstanceEcGraph Graph of EC numbers which occur in the ancestor's EC graph, and in the decendants's, i.e. EC numbers which are conserved in the descendant. Substance-EC-product edges are only included if both graphs, ancestor and descendant, have both nodes, substrate and product. """ conservedGraph = ancestorEcGraph.intersection(descendantEcGraph, addCount=False) conservedGraph.removeIsolatedNodes() return conservedGraph
[docs]class GeneFunctionAddition(object): """ Evolutionary event of adding a gene function (EC number) between a pair of arbitrary ancestor and descendant. The conditions for a gene function addition are: - The EC number has been added, from an unknown origin, along the way from an older group of organisms to a newer one. """
[docs] @staticmethod def getECs(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> Set[EcNumber]: """ Get EC numbers which have been added between ancestor and descendant, existing only in the descendant. Parameters ---------- ancestorEcGraph : SubstanceEcGraph descendantEcGraph : SubstanceEcGraph Returns ------- Set[EcNumber] Set of EC numbers which occur in the descendant's EC graph, but not in the ancestor's, i.e. EC numbers which are new to the descendant. """ addedECs = descendantEcGraph.getECs() addedECs.difference_update(ancestorEcGraph.getECs()) return addedECs
[docs] @staticmethod def getGraph(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> SubstanceEcGraph: """ Get graph containing EC numbers which have been added between ancestor and descendant, existing only in the descendant. Parameters ---------- ancestorEcGraph : SubstanceEcGraph descendantEcGraph : SubstanceEcGraph Returns ------- SubstanceEcGraph Graph of EC numbers which occur in the descendant's EC graph, but not in the ancestor's, i.e. EC numbers which are new in the descendant. Substance-EC-product edges are only included if both graphs, ancestor and descendant, have both nodes, substrate and product. """ addedGraph = descendantEcGraph.difference(ancestorEcGraph, subtractNodes=False) addedGraph.removeIsolatedNodes() return addedGraph
[docs]class GeneFunctionLoss(object): """ Evolutionary event of losing a gene function (EC number) between a pair of arbitrary ancestor and descendant. The conditions for a gene function loss are: - The EC number has been lost, along the way from an older group of organisms to a newer one. """
[docs] @staticmethod def getECs(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> Set[EcNumber]: """ Get EC numbers which have been lost between ancestor and descendant, existing only in the ancestor. Parameters ---------- ancestorEcGraph : SubstanceEcGraph descendantEcGraph : SubstanceEcGraph Returns ------- Set[EcNumber] Set of EC numbers which occur in the ancestor's EC graph, but not in the decendants's, i.e. EC numbers which are lost to the descendant. """ lostECs = ancestorEcGraph.getECs() lostECs.difference_update(descendantEcGraph.getECs()) return lostECs
[docs] @staticmethod def getGraph(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> SubstanceEcGraph: """ Get graph containing EC numbers which have been lost between ancestor and descendant, existing only in the ancestor. Parameters ---------- ancestorEcGraph : SubstanceEcGraph descendantEcGraph : SubstanceEcGraph Returns ------- SubstanceEcGraph Graph of EC numbers which occur in the ancestor's EC graph, but not in the decendants's, i.e. EC numbers which are lost in the descendant. Substance-EC-product edges are only included if both graphs, ancestor and descendant, have both nodes, substrate and product. """ lostGraph = ancestorEcGraph.difference(descendantEcGraph, subtractNodes=False) lostGraph.removeIsolatedNodes() return lostGraph
[docs]class GeneFunctionDivergence(object): """ Evolutionary event of diverging (adding or losing) a gene function (EC number) between a pair of arbitrary ancestor and descendant. The conditions for a gene function divergence are: - The EC number exists in an older group of organisms, but not in a newer one, or the other way around. """
[docs] @staticmethod def getECs(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> Set[EcNumber]: """ Get EC numbers which have diverged between ancestor and descendant, existing only in either one of them. Obviously, `ancestorEcGraph` and `descendantEcGraph` can be swapped here without changing the result. Parameters ---------- ancestorEcGraph : SubstanceEcGraph descendantEcGraph : SubstanceEcGraph Returns ------- Set[EcNumber] Set of EC numbers which occur in the ancestor's EC graph, but not in the decendants's and vice versa, i.e. EC numbers which only exist in either one of the organism groups. """ divergedECs = ancestorEcGraph.getECs() divergedECs.symmetric_difference(descendantEcGraph.getECs()) return divergedECs
[docs] @staticmethod def getGraph(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> SubstanceEcGraph: """ Get graph containing EC numbers which have diverged between ancestor and descendant, existing only in either one of them. Parameters ---------- ancestorEcGraph : SubstanceEcGraph descendantEcGraph : SubstanceEcGraph Returns ------- SubstanceEcGraph Graph of EC numbers which occur in the ancestor's EC graph, but not in the decendants's and vice versa, i.e. EC numbers which only exist in either one of the organism groups. Substance-EC-product edges are only included if both graphs, ancestor and descendant, have both nodes, substrate and product. """ addedGraph = GeneFunctionAddition.getGraph(ancestorEcGraph, descendantEcGraph) lostGraph = GeneFunctionLoss.getGraph(ancestorEcGraph, descendantEcGraph) divergedGraph = addedGraph.union(lostGraph, addCount=False) return divergedGraph
[docs]class GeneDuplication(object): """ Abstract class for any type of gene duplication. """
[docs]class SimpleGroupGeneDuplication(GeneDuplication): """ Evolutionary event of duplicating a gene, regardless of ancestoral bonds in the comparison group. The conditions for a 'simple group' gene duplication are: - The gene has at least one homolog within the set of organisms its organism belongs to. """ def __init__(self, sameGroupOrganisms: 'Iterable[Organism] or KEGG.Organism.Group'): """ Simple group gene duplication extends simple gene duplication by expanding the term 'paralog' to every organism in the set of organisms the gene's organism blongs to. In contrast to :class:`SimpleGeneDuplication`, this class has to be instantiated, using the aforementioned set of organisms belonging to each other. This would usually be a :class:`FEV_KEGG.KEGG.Organism.Group` of the same :class:`FEV_KEGG.Evolution.Clade.Clade`. Parameters ---------- sameGroupOrganisms : Iterable[Organism] or Organism.Group Organisms which will be searched for the occurence of homologs, i.e. are considered "semi-paralogously" related. Attributes ---------- self.sameGroupOrganisms : Iterable[Organism] Raises ------ ValueError If `sameGroupOrganisms` is of wrong type. Warnings -------- This takes much longer than :class:`SimpleGeneDuplication`, because each sought gene is compared between all organisms of the same group, not only within its own organism. """ if isinstance(sameGroupOrganisms, Group): self.sameGroupOrganisms = sameGroupOrganisms.organisms elif isinstance(sameGroupOrganisms, Iterable): self.sameGroupOrganisms = sameGroupOrganisms else: raise ValueError("'sameGroupOrganisms' must be of type Iterable or KEGG.Organism.Group")
[docs] def getEnzymes(self, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, returnMatches = False, ignoreDuplicatesOutsideSet = None): """ Get gene-duplicated enzymes. Parameters ---------- enzymes : Set[Enzyme] or SubstanceEnzymeGraph Set of enzymes to be checked for gene duplication, or a graph. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. returnMatches : bool, optional If *True*, return not only `enzymes` that have homologs, but also which homologs they have. Useful for filtering for relevant homologs afterwards. ignoreDuplicatesOutsideSet : Set[GeneID], optional If *None*, report all found duplicates. If not *None*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set. This can, for example, serve to exclude duplicates in secondary metabolism. Returns ------- Set[Enzyme] or Dict[Element, Set[GeneID]] If returnMatches == *False*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition. If returnMatches == *True*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition, pointing to a set of gene IDs of the found homologs. Raises ------ ValueError If any organism does not exist. HTTPError If any gene does not exist. URLError If connection to KEGG fails. """ if isinstance(enzymes, SubstanceEnzymeGraph): enzymes = enzymes.getEnzymes() # find gene duplicated enzymes according to simple gene duplication shallReturnMatches = returnMatches or (ignoreDuplicatesOutsideSet is not None) simpleGeneDuplicates = SimpleGeneDuplication.getEnzymes(enzymes, eValue, returnMatches = shallReturnMatches, ignoreDuplicatesOutsideSet = ignoreDuplicatesOutsideSet) if returnMatches or ignoreDuplicatesOutsideSet is not None: simpleGeneDuplicatesSet = simpleGeneDuplicates.keys() else: simpleGeneDuplicatesSet = simpleGeneDuplicates duplicatedEnzymes = dict() # those enzymes already duplicated according to simple gene duplication do not have to be tested again soughtEnzymes = enzymes.copy() soughtEnzymes.difference_update( simpleGeneDuplicatesSet ) # get orthologs geneIDs = [enzyme.geneID for enzyme in soughtEnzymes] if returnMatches or ignoreDuplicatesOutsideSet is not None: # need to get all orthologs matchingsDict = Database.getOrthologsBulk(geneIDs, self.sameGroupOrganisms, eValue) # GeneID -> List[ Matching ] for enzyme in enzymes: # for all enzymes, even the ones already found as paralogs # add paralogs paralogousGeneIDs = simpleGeneDuplicates.get(enzyme.geneID) if paralogousGeneIDs is not None: if len(paralogousGeneIDs) > 0: duplicatedEnzymes[enzyme] = paralogousGeneIDs continue # can not be in orthologs if it was already paralogous # add orthologs orthologousMatchings = matchingsDict.get(enzyme.geneID) if orthologousMatchings is not None: if len(orthologousMatchings) > 0: matches = [] for matching in orthologousMatchings: matches.extend(matching.matches) matchedGeneIDs = {match.foundGeneID for match in matches} if len(matchedGeneIDs) > 0: currentDuplicates = duplicatedEnzymes.get(enzyme) if currentDuplicates is None: currentDuplicates = set() duplicatedEnzymes[enzyme] = currentDuplicates currentDuplicates.update(matchedGeneIDs) else: # only interesting IF there are orthologs orthologousOrganismsDict = Database.hasOrthologsBulk(geneIDs, self.sameGroupOrganisms, eValue) # GeneID -> List[ organismAbbreviation ] # regarding each enzyme sought for enzyme in soughtEnzymes: # count orthologous GeneIDs orthologousOrganisms = orthologousOrganismsDict.get(enzyme.geneID) if orthologousOrganisms is None: continue if len(orthologousOrganisms) > 0: # has at least one ortholog in any other organism duplicatedEnzymes[enzyme] = None #orthologousOrganisms if ignoreDuplicatesOutsideSet is not None: filteredDuplicatedEnzymes = dict() for enzyme, matchGeneIDs in duplicatedEnzymes.items(): matchesInSearchedSet = ignoreDuplicatesOutsideSet.intersection( matchGeneIDs ) if len(matchesInSearchedSet) > 0: # some of the matches are in the set of enzymes to be checked for duplicates filteredDuplicatedEnzymes[enzyme] = matchesInSearchedSet duplicatedEnzymes = filteredDuplicatedEnzymes if returnMatches: return duplicatedEnzymes else: return set(duplicatedEnzymes.keys())
[docs] def getEnzymePairs(self, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, ignoreDuplicatesOutsideSet = None, geneIdToEnzyme = None) -> Set[Tuple[Enzyme, Enzyme]]: """ Get gene-duplicated enzymes, in pairs of duplicates. If enzymeA is a duplicate of enzymeB and vice versa, this returns symmetric duplicates of the form (enzymeA, enzymeB) and (enzymeB, enzymeA). Parameters ---------- enzymes : Set[Enzyme] or SubstanceEnzymeGraph Set of enzymes to be checked for gene duplication, or a graph. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. ignoreDuplicatesOutsideSet : Set[GeneID], optional If *None*, report all found duplicates. If not *None*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set. This can, for example, serve to exclude duplicates in secondary metabolism. geneIdToEnzyme : Dict[GeneID, Enzyme], optional Dictionary for mapping each gene ID of every found duplicate to an enzyme object. If *None*, gets the enzyme from the database. This avoids the KeyError, but can cause a lot of network load. Returns ------- Set[Tuple[Enzyme, Enzyme]] Set of pairs of gene-duplicated enzymes, realised as tuples. The order is arbitrary and there will almost certainly be 100% duplicates. Raises ------ KeyError If `geneIdToEnzyme` is passed, but does not contain the gene ID of every duplicate. """ duplicatedEnzymeMatches = self.getEnzymes(enzymes, eValue, returnMatches = True, ignoreDuplicatesOutsideSet = ignoreDuplicatesOutsideSet) # expand matches of homologous gene IDs to pairs of duplicated enzymes. Which we can do (without wasting further resources) only here, because only here we have the geneID -> enzyme dict! duplicatedEnzymePairs = set() if geneIdToEnzyme is None: # need to get enzyme objects from database allGeneIDs = set() for geneIDs in duplicatedEnzymeMatches.values(): allGeneIDs.update( geneIDs ) geneIdToEnzyme = dict() for geneID, gene in Database.getGeneBulk(allGeneIDs).items(): enzyme = Enzyme.fromGene(gene) geneIdToEnzyme[geneID] = enzyme for enzymeA, geneIDs in duplicatedEnzymeMatches.items(): for geneID in geneIDs: enzymeB = geneIdToEnzyme[geneID] duplicatedEnzymePairs.add( (enzymeA, enzymeB) ) return duplicatedEnzymePairs
[docs] def filterEnzymes(self, substanceEnzymeGraph: SubstanceEnzymeGraph, eValue = defaultEValue) -> SubstanceEnzymeGraph: """ Remove all enzymes from a graph which have not been gene-duplicated. Parameters ---------- substanceEnzymeGraph : SubstanceEnzymeGraph Graph of enzymes to be checked for gene duplication. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. Returns ------- SubstanceEnzymeGraph A copy of the `substanceEnzymeGraph` containing only enzymes which fulfil the conditions of this gene duplication definition. Raises ------ ValueError If any organism does not exist. HTTPError If any gene does not exist. URLError If connection to KEGG fails. """ graph = substanceEnzymeGraph.copy() possibleGeneDuplicates = self.getEnzymes(substanceEnzymeGraph, eValue) graph.removeAllEnzymesExcept( possibleGeneDuplicates ) return graph
[docs]class SimpleGeneDuplication(GeneDuplication): """ Evolutionary event of duplicating a gene, regardless of ancestoral bonds. The conditions for a 'simple' gene duplication are: - The gene has at least one paralog. """
[docs] @staticmethod def getEnzymes(enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, returnMatches = False, ignoreDuplicatesOutsideSet = None, preCalculatedEnzymes = None): """ Get gene-duplicated enzymes. Parameters ---------- enzymes : Set[Enzyme] or SubstanceEnzymeGraph Set of enzymes to be checked for gene duplication, or a graph. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. returnMatches : bool, optional If *True*, return not only `enzymes` that have homologs, but also which homologs they have. Useful for filtering for relevant homologs afterwards. ignoreDuplicatesOutsideSet : Set[GeneID] or *True*, optional If *None*, report all found duplicates. If *True*, automatically restrict to all enzymes in `substanceEnzymeGraph`. If a set, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set. Beware, the set has to contain the enzymes' gene ID! This can, for example, serve to exclude duplicates in secondary metabolism. Returns ------- Set[Enzyme] or Dict[Element, Set[GeneID]] If returnMatches == *False*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition. If returnMatches == *True*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition, pointing to a set of gene IDs of the found homologs. Raises ------ ValueError If any organism does not exist. HTTPError If any gene does not exist. URLError If connection to KEGG fails. """ if isinstance(enzymes, SubstanceEnzymeGraph): enzymes = enzymes.getEnzymes() if preCalculatedEnzymes is not None: if returnMatches is True: result = dict() else: result = set() # filter preCalculatedEnzymes by enzymes for enzyme, duplicatesSet in preCalculatedEnzymes.items(): if enzyme in enzymes: validDuplicates = set() for duplicate in duplicatesSet: if ignoreDuplicatesOutsideSet is None or duplicate.geneID in ignoreDuplicatesOutsideSet: validDuplicates.add( duplicate.geneID ) if len(validDuplicates) > 0: if returnMatches is True: result[enzyme] = validDuplicates else: result.add( enzyme ) return result possibleGeneDuplicates = dict() geneIDs = {enzyme.geneID for enzyme in enzymes} matchingsDict = Database.getParalogsBulk(geneIDs, eValue) for enzyme in enzymes: matching = matchingsDict.get(enzyme.geneID, None) if matching is None: print( 'WARNING: data for GeneID ' + str(enzyme.geneID) + ' could not be downloaded. Maybe you want to exclude this erroneous organism completely, before it skews statistics? See quirks.py') continue paralogs = matching.matches if len(paralogs) > 0: possibleGeneDuplicates[enzyme] = [match.foundGeneID for match in paralogs] if ignoreDuplicatesOutsideSet is True: ignoreDuplicatesOutsideSet = geneIDs if ignoreDuplicatesOutsideSet is not None and len(ignoreDuplicatesOutsideSet) > 0: filteredPossibleGeneDuplicates = dict() for enzyme, matchGeneIDs in possibleGeneDuplicates.items(): # which genes have been found AND are in the set of relevant duplicates? matchesInSearchedSet = ignoreDuplicatesOutsideSet.intersection( matchGeneIDs ) if len(matchesInSearchedSet) > 0: # some of the matches are in the set of relevant duplicates filteredPossibleGeneDuplicates[enzyme] = matchesInSearchedSet possibleGeneDuplicates = filteredPossibleGeneDuplicates if returnMatches: return possibleGeneDuplicates else: return set(possibleGeneDuplicates.keys())
[docs] @classmethod def getEnzymePairs(cls, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, ignoreDuplicatesOutsideSet = None, geneIdToEnzyme = None, preCalculatedEnzymes = None) -> Set[Tuple[Enzyme, Enzyme]]: """ Get gene-duplicated enzymes, in pairs of duplicates. If enzyme A is a duplicate of enzyme B and vice versa, this does not return duplicates, but returns only one pair, with the "smaller" enzyme as the first value. An enzyme is "smaller" if its gene ID string is "smaller". Parameters ---------- enzymes : Set[Enzyme] or SubstanceEnzymeGraph Set of enzymes to be checked for gene duplication, or a graph. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. ignoreDuplicatesOutsideSet : Set[GeneID] or *True*, optional If *None*, report all found duplicates. If *True*, automatically restrict to all enzymes in `substanceEnzymeGraph`. If a set, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set. Beware, the set has to contain the enzymes' gene ID! This can, for example, serve to exclude duplicates in secondary metabolism. geneIdToEnzyme : Dict[GeneID, Enzyme], optional Dictionary for mapping each gene ID of every found duplicate to an enzyme object. If *None*, gets the enzyme from the database. This avoids the KeyError, but can cause a lot of network load. Returns ------- Set[Tuple[Enzyme, Enzyme]] Set of pairs of gene-duplicated enzymes, realised as tuples. The order is arbitrary. Raises ------ KeyError If `geneIdToEnzyme` is passed, but does not contain the gene ID of every duplicate. """ duplicatedEnzymeMatches = cls.getEnzymes(enzymes, eValue, returnMatches = True, ignoreDuplicatesOutsideSet = ignoreDuplicatesOutsideSet, preCalculatedEnzymes = preCalculatedEnzymes) # expand matches of homologous gene IDs to pairs of duplicated enzymes. Which we can do (without wasting further resources) only here, because only here we have the geneID -> enzyme dict! duplicatedEnzymePairs = set() if geneIdToEnzyme is None: # need to get enzyme objects from database allGeneIDs = set() for geneIDs in duplicatedEnzymeMatches.values(): allGeneIDs.update( geneIDs ) geneIdToEnzyme = dict() for geneID, gene in Database.getGeneBulk(allGeneIDs).items(): enzyme = Enzyme.fromGene(gene) geneIdToEnzyme[geneID] = enzyme for enzymeA, geneIDs in duplicatedEnzymeMatches.items(): for geneID in geneIDs: enzymeB = geneIdToEnzyme[geneID] duplicatedEnzymePairs.add( (enzymeA, enzymeB) ) # filter symmetric duplicates deduplicatedEnzymePairs = set() for enzymeA, enzymeB in duplicatedEnzymePairs: if enzymeA <= enzymeB: deduplicatedEnzymePairs.add( (enzymeA, enzymeB) ) else: deduplicatedEnzymePairs.add( (enzymeB, enzymeA) ) return deduplicatedEnzymePairs
[docs] @classmethod def filterEnzymes(cls, substanceEnzymeGraph: SubstanceEnzymeGraph, eValue = defaultEValue, ignoreDuplicatesOutsideSet = None, preCalculatedEnzymes = None) -> SubstanceEnzymeGraph: """ Remove all enzymes from a graph which have not been gene-duplicated. Parameters ---------- substanceEnzymeGraph : SubstanceEnzymeGraph Graph of enzymes to be checked for gene duplication. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. ignoreDuplicatesOutsideSet : Set[GeneID] or *True*, optional If *None*, report all found duplicates. If *True*, automatically restrict to all enzymes in `substanceEnzymeGraph`. If a set, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set. Beware, the set has to contain the enzymes' gene ID! This can, for example, serve to exclude duplicates in secondary metabolism. Returns ------- SubstanceEnzymeGraph A copy of the `substanceEnzymeGraph` containing only enzymes which fulfil the conditions of this gene duplication definition. Raises ------ ValueError If any organism does not exist. HTTPError If any gene does not exist. URLError If connection to KEGG fails. """ graph = substanceEnzymeGraph.copy() possibleGeneDuplicates = cls.getEnzymes(substanceEnzymeGraph, eValue, returnMatches = False, ignoreDuplicatesOutsideSet = ignoreDuplicatesOutsideSet, preCalculatedEnzymes = preCalculatedEnzymes) graph.removeAllEnzymesExcept( possibleGeneDuplicates ) return graph
[docs]class ChevronGeneDuplication(GeneDuplication): """ Evolutionary event of duplicating a gene, in dependence of a certain ancestoral bond. The conditions for a 'chevron' gene duplication are: - The gene has at least one paralog. - The gene has at least one ortholog in a pre-defined set of organisms. """ def __init__(self, possiblyOrthologousOrganisms: 'Iterable[Organism] or KEGG.Organism.Group'): """ Chevron gene duplication extends simple gene duplication by limiting the possibly duplicated genes via a set of possibly orthologous organisms. In contrast to :class:`SimpleGeneDuplication`, this class has to be instantiated, using the aforementioned set of possibly orthologous organisms. Parameters ---------- possiblyOrthologousOrganisms : Iterable[Organism] or Organism.Group Organisms which will be searched for the occurence of orthologs, i.e. are considered ancestoral. Attributes ---------- self.possiblyOrthologousOrganisms : Iterable[Organism] Raises ------ ValueError If `possiblyOrthologousOrganisms` is of wrong type. Warnings -------- This takes much longer than :class:`SimpleGeneDuplication`, because additionally, each found paralog is searched for an ortholog in all organisms of the other group. However, if you set `returnMatches` == *False* and `ignoreDuplicatesOutsideSelf` == *False*, the search is aborted with the very first ortholog, which is much faster than getting all orthologs. Because in this model even a single orthologous match is enough to prove gene duplication, we do not necessarily have to fully search all organisms. """ if isinstance(possiblyOrthologousOrganisms, Group): self.possiblyOrthologousOrganisms = possiblyOrthologousOrganisms.organisms elif isinstance(possiblyOrthologousOrganisms, Iterable): self.possiblyOrthologousOrganisms = possiblyOrthologousOrganisms else: raise ValueError("'possiblyOrthologusOrganisms' must be of type Iterable or KEGG.Organism.Group")
[docs] def getEnzymes(self, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, returnMatches = False, ignoreDuplicatesOutsideSelf = False): """ Get gene-duplicated enzymes. Parameters ---------- enzymes : Set[Enzyme] or SubstanceEnzymeGraph Set of enzymes to be checked for gene duplication, or a graph. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. returnMatches : bool, optional If *True*, return not only `enzymes` that have homologs, but also which homologs they have. Useful for filtering for relevant homologs afterwards. ignoreDuplicatesOutsideSelf : bool, optional If *True*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside the set of enzymes searched for duplicates. This can, for example, serve to exclude duplicates in secondary metabolism. Returns ------- Set[Enzyme] or Dict[Element, Set[GeneID]] If returnMatches == *False*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition. If returnMatches == *True*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition, pointing to a set of gene IDs of the found homologs. Raises ------ ValueError If any organism does not exist. HTTPError If any gene does not exist. URLError If connection to KEGG fails. """ # graph -> set if isinstance(enzymes, SubstanceEnzymeGraph): enzymes = enzymes.getEnzymes() # get paralogs first possibleGeneDuplicates = SimpleGeneDuplication.getEnzymes(enzymes, eValue, returnMatches = returnMatches or ignoreDuplicatesOutsideSelf, ignoreDuplicatesOutsideSet = {enzyme.geneID for enzyme in enzymes}) # always need returnMatches if ignoreDuplicatesOutsideSelf is True! if returnMatches or ignoreDuplicatesOutsideSelf: possibleGeneDuplicatesSet = possibleGeneDuplicates.keys() else: possibleGeneDuplicatesSet = possibleGeneDuplicates if len( possibleGeneDuplicatesSet ) == 0: # nothing to do, because there are no paralogs return possibleGeneDuplicates duplicatedEnzymes = dict() # get orthologs geneIDs = {enzyme.geneID for enzyme in possibleGeneDuplicatesSet} if returnMatches or ignoreDuplicatesOutsideSelf: # need to get all orthologs matchingsDict = Database.getOrthologsBulk(geneIDs, self.possiblyOrthologousOrganisms, eValue) # GeneID -> List[ Matching ] for enzyme in possibleGeneDuplicatesSet: # add orthologs orthologousMatchings = matchingsDict.get(enzyme.geneID) if orthologousMatchings is not None: if len(orthologousMatchings) > 0: matches = [] for matching in orthologousMatchings: matches.extend(matching.matches) duplicatedEnzymes[enzyme] = {match.foundGeneID for match in matches} # add paralogs paralogousGeneIDs = possibleGeneDuplicates.get(enzyme.geneID) if paralogousGeneIDs is not None: if len(paralogousGeneIDs) > 0: currentDuplicates = duplicatedEnzymes.get(enzyme) if currentDuplicates is None: currentDuplicates = set() duplicatedEnzymes[enzyme] = currentDuplicates currentDuplicates.update(paralogousGeneIDs) else: # only interesting IF there are orthologs orthologousOrganismsDict = Database.hasOrthologsBulk(geneIDs, self.possiblyOrthologousOrganisms, eValue) # GeneID -> List[ organismAbbreviation ] for enzyme in possibleGeneDuplicatesSet: orthologousOrganisms = orthologousOrganismsDict.get(enzyme.geneID) if orthologousOrganisms is None: continue if len(orthologousOrganisms) > 0: duplicatedEnzymes[enzyme] = None #orthologousOrganisms if ignoreDuplicatesOutsideSelf: filteredDuplicatedEnzymes = dict() for enzyme, matchGeneIDs in duplicatedEnzymes.items(): matchesInSearchedSet = geneIDs.intersection( matchGeneIDs ) if len(matchesInSearchedSet) > 0: # some of the matches are in the set of enzymes to be checked for duplicates filteredDuplicatedEnzymes[enzyme] = matchesInSearchedSet duplicatedEnzymes = filteredDuplicatedEnzymes if returnMatches: return duplicatedEnzymes else: return set(duplicatedEnzymes.keys())
[docs] def getEnzymePairs(self, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, ignoreDuplicatesOutsideSelf = False, geneIdToEnzyme = None) -> Set[Tuple[Enzyme, Enzyme]]: """ Get gene-duplicated enzymes, in pairs of duplicates. If enzyme A is a duplicate of enzyme B and vice versa, this does not return duplicates, but returns only one pair, with the "smaller" enzyme as the first value. An enzyme is "smaller" if its gene ID string is "smaller". Parameters ---------- enzymes : Set[Enzyme] or SubstanceEnzymeGraph Set of enzymes to be checked for gene duplication, or a graph. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. ignoreDuplicatesOutsideSelf : bool, optional If *True*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside the set of enzymes searched for duplicates. This can, for example, serve to exclude duplicates in secondary metabolism. geneIdToEnzyme : Dict[GeneID, Enzyme], optional Dictionary for mapping each gene ID of every found duplicate to an enzyme object. If *None*, gets the enzyme from the database. This avoids the KeyError, but can cause a lot of network load. Returns ------- Set[Tuple[Enzyme, Enzyme]] Set of pairs of gene-duplicated enzymes, realised as tuples. The order is arbitrary. Raises ------ KeyError If `geneIdToEnzyme` is passed, but does not contain the gene ID of every duplicate. """ duplicatedEnzymeMatches = self.getEnzymes(enzymes, eValue, returnMatches = True, ignoreDuplicatesOutsideSelf = ignoreDuplicatesOutsideSelf) # expand matches of homologous gene IDs to pairs of duplicated enzymes. Which we can do (without wasting further resources) only here, because only here we have the geneID -> enzyme dict! duplicatedEnzymePairs = set() if geneIdToEnzyme is None: # need to get enzyme objects from database allGeneIDs = set() for geneIDs in duplicatedEnzymeMatches.values(): allGeneIDs.update( geneIDs ) geneIdToEnzyme = dict() for geneID, gene in Database.getGeneBulk(allGeneIDs).items(): enzyme = Enzyme.fromGene(gene) geneIdToEnzyme[geneID] = enzyme for enzymeA, geneIDs in duplicatedEnzymeMatches.items(): for geneID in geneIDs: enzymeB = geneIdToEnzyme[geneID] duplicatedEnzymePairs.add( (enzymeA, enzymeB) ) # filter symmetric duplicates deduplicatedEnzymePairs = set() for enzymeA, enzymeB in duplicatedEnzymePairs: if enzymeA <= enzymeB: deduplicatedEnzymePairs.add( (enzymeA, enzymeB) ) else: deduplicatedEnzymePairs.add( (enzymeB, enzymeA) ) return deduplicatedEnzymePairs
[docs] def filterEnzymes(self, substanceEnzymeGraph: SubstanceEnzymeGraph, eValue = defaultEValue, ignoreDuplicatesOutsideSelf = False) -> SubstanceEnzymeGraph: """ Remove all enzymes from a graph which have not been gene-duplicated. Parameters ---------- substanceEnzymeGraph : SubstanceEnzymeGraph Graph of enzymes to be checked for gene duplication. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. ignoreDuplicatesOutsideSelf : bool, optional If *True*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside the set of enzymes searched for duplicates. This can, for example, serve to exclude duplicates in secondary metabolism. Returns ------- SubstanceEnzymeGraph A copy of the `substanceEnzymeGraph` containing only enzymes which fulfil the conditions of this gene duplication definition. Raises ------ ValueError If any organism does not exist. HTTPError If any gene does not exist. URLError If connection to KEGG fails. """ graph = substanceEnzymeGraph.copy() possibleGeneDuplicates = self.getEnzymes(substanceEnzymeGraph, eValue, returnMatches = False, ignoreDuplicatesOutsideSelf = ignoreDuplicatesOutsideSelf) graph.removeAllEnzymesExcept( possibleGeneDuplicates ) return graph
[docs]class Neofunctionalisation(): def __init__(self, enzymeA: Enzyme, enzymeB: Enzyme): """ Evolutionary event of Neofunctionalisation between a pair of enzymes. The conditions for a neofunctionalisation are: - The enzyme's gene has been duplicated, according to a certain class of GeneDuplication. - The duplicated enzyme is associated with a different EC number than its duplicate. The order of the two enzymes has no meaning, it has been arbitrarily chosen to reflect the lexicographic order of their associated EC numbers. The enzyme posessing the "smallest" EC number comes first. This absolute ordering prevents duplicate events, because without an order there would have always been a second event with the exact same enzymes, but in swapped positions, because neofunctionalisation has no direction here and is, thus, symmetric. Parameters ---------- enzymeA : Enzyme An enzyme, which is a gene duplicate of `enzymeB`. The order is arbitrary. enzymeB : Enzyme An enzyme, which is a gene duplicate of `enzymeA`. The order is arbitrary. Attributes ---------- self.enzymePair : Tuple[Enzyme, Enzyme] Tuple of the two enzymes, sorted by the lexicographic order of their "smallest" EC number. Raises ------ ValueError If the enzymes are equal, have the same set of EC numbers, or one has no EC number. """ if enzymeA == enzymeB: raise ValueError('The enzymes must be unequal!') ecNumbersAset = enzymeA.ecNumbers ecNumbersBset = enzymeB.ecNumbers if ecNumbersAset == ecNumbersBset: raise ValueError('The enzymes must have differing EC numbers to be NEOfunctionalised!') if len(ecNumbersAset) == 0 or len(ecNumbersBset) == 0: raise ValueError('The enzymes have to be associated with at least one EC number!') ecNumbersA = list(ecNumbersAset) ecNumbersA.sort() ecNumbersB = list(ecNumbersBset) ecNumbersB.sort() if len(ecNumbersA) <= len(ecNumbersB): smallerSet = ecNumbersA biggerSet = ecNumbersB isAsmaller = True else: smallerSet = ecNumbersB biggerSet = ecNumbersA isAsmaller = False for index, ec1 in enumerate(smallerSet): ec2 = biggerSet[index] if ec1 == ec2: if index == len(smallerSet) - 1: # end of smaller set reached if isAsmaller: self.enzymePair = (enzymeA, enzymeB) else: self.enzymePair = (enzymeB, enzymeA) else: # not yet at the end, compare next indizes continue elif ec1 < ec2: if isAsmaller: self.enzymePair = (enzymeA, enzymeB) else: self.enzymePair = (enzymeB, enzymeA) else: if isAsmaller: self.enzymePair = (enzymeB, enzymeA) else: self.enzymePair = (enzymeA, enzymeB)
[docs] def getEnzymes(self) -> Tuple[Enzyme, Enzyme]: """ Get the pair of enzymes. Returns ------- Tuple[Enzyme, Enzyme] """ return self.enzymePair
[docs] def getEcNumbers(self) -> Tuple[Set[EcNumber], Set[EcNumber]]: """ Get the enzymes' EC numbers. Returns ------- Tuple[Set[EcNumber], Set[EcNumber]] Same order as in :func:`getEnzymes`. Because an enzyme could have multiple EC numbers, they are given as sets. """ return (self.enzymePair[0].ecNumbers, self.enzymePair[1].ecNumbers)
[docs] def getDifferingEcLevels(self) -> int: """ Get the maximum number of EC levels in which the enzymes' EC numbers differ. Returns ------- int Number of differing EC levels between the two enzymes' EC numbers, starting with the substrate-level. If an enzyme has multiple EC numbers, returns the biggest difference. For example 1.2.3.4 and 1.2.3.7 returns 1, while 1.2.3.4 and 1.8.9.10 returns 3. However, wildcards do **not** match numbers: 1.2.3.4 and 1.2.3.- returns 1! """ biggestDifference = 0 for ecA in self.getEcNumbers()[0]: for ecB in self.getEcNumbers()[1]: difference = 4 - ecA.matchingLevels(ecB, wildcardMatchesNumber = False) if difference > biggestDifference: biggestDifference = difference return biggestDifference
[docs] def isSameEcReaction(self) -> bool: """ Whether the enzymes' EC numbers describe a different reaction, or merely a different substrate Returns ------- bool *True*, if the biggest difference in EC levels of the two enzymes is still on the fourth (substrate) level. """ if self.getDifferingEcLevels() > 1: return False else: return True
[docs] def toHtml(self, short = False): """ Get the string representation as an HTML line. """ enzymePair = self.enzymePair ecPair = self.getEcNumbers() return '<td>' + enzymePair[0].toHtml() + '<td>' + ',&nbsp;'.join([ec.toHtml(short) for ec in sorted(ecPair[0])]) + '</td></td><td><-></td><td>'+ enzymePair[1].toHtml() + '<td>' + ',&nbsp;'.join([ec.toHtml(short) for ec in sorted(ecPair[1])]) + '</td></td>'
def __str__(self): enzymePair = self.enzymePair ecPair = self.getEcNumbers() return '(' + str(enzymePair[0]) + ' [' + ', '.join([str(ec) for ec in sorted(ecPair[0])]) + '],\t'+ str(enzymePair[1]) + ' [' + ', '.join([str(ec) for ec in sorted(ecPair[1])]) + '])' def __repr__(self): return self.__str__() def __eq__(self, other): if isinstance(self, other.__class__): return self.enzymePair == other.enzymePair return False def __ne__(self, other): return not self == other def __hash__(self): return self.enzymePair.__hash__() def __lt__(self, other): # sort by EC number first selfEnzyme1 = self.enzymePair[0] selfEnzyme2 = self.enzymePair[1] selfEnzyme1EcList = list(selfEnzyme1.ecNumbers) selfEnzyme2EcList = list(selfEnzyme2.ecNumbers) otherEnzyme1 = other.enzymePair[0] otherEnzyme2 = other.enzymePair[1] otherEnzyme1EcList = list(otherEnzyme1.ecNumbers) otherEnzyme2EcList = list(otherEnzyme2.ecNumbers) if selfEnzyme1EcList == otherEnzyme1EcList: if selfEnzyme2EcList == otherEnzyme2EcList: # then by gene ID if selfEnzyme1.uniqueID == otherEnzyme1.uniqueID: return selfEnzyme2.uniqueID < otherEnzyme2.uniqueID else: return selfEnzyme1.uniqueID < otherEnzyme1.uniqueID else: return selfEnzyme2EcList < otherEnzyme2EcList else: return selfEnzyme1EcList < otherEnzyme1EcList def __gt__(self, other): # sort by EC number first selfEnzyme1 = self.enzymePair[0] selfEnzyme2 = self.enzymePair[1] selfEnzyme1EcList = list(selfEnzyme1.ecNumbers) selfEnzyme2EcList = list(selfEnzyme2.ecNumbers) otherEnzyme1 = other.enzymePair[0] otherEnzyme2 = other.enzymePair[1] otherEnzyme1EcList = list(otherEnzyme1.ecNumbers) otherEnzyme2EcList = list(otherEnzyme2.ecNumbers) if selfEnzyme1EcList == otherEnzyme1EcList: if selfEnzyme2EcList == otherEnzyme2EcList: # then by gene ID if selfEnzyme1.uniqueID == otherEnzyme1.uniqueID: return selfEnzyme2.uniqueID > otherEnzyme2.uniqueID else: return selfEnzyme1.uniqueID > otherEnzyme1.uniqueID else: return selfEnzyme2EcList > otherEnzyme2EcList else: return selfEnzyme1EcList > otherEnzyme1EcList def __le__(self, other): # sort by EC number first selfEnzyme1 = self.enzymePair[0] selfEnzyme2 = self.enzymePair[1] selfEnzyme1EcList = list(selfEnzyme1.ecNumbers) selfEnzyme2EcList = list(selfEnzyme2.ecNumbers) otherEnzyme1 = other.enzymePair[0] otherEnzyme2 = other.enzymePair[1] otherEnzyme1EcList = list(otherEnzyme1.ecNumbers) otherEnzyme2EcList = list(otherEnzyme2.ecNumbers) if selfEnzyme1EcList == otherEnzyme1EcList: if selfEnzyme2EcList == otherEnzyme2EcList: # then by gene ID if selfEnzyme1.uniqueID == otherEnzyme1.uniqueID: return selfEnzyme2.uniqueID <= otherEnzyme2.uniqueID else: return selfEnzyme1.uniqueID <= otherEnzyme1.uniqueID else: return selfEnzyme2EcList <= otherEnzyme2EcList else: return selfEnzyme1EcList <= otherEnzyme1EcList def __ge__(self, other): # sort by EC number first selfEnzyme1 = self.enzymePair[0] selfEnzyme2 = self.enzymePair[1] selfEnzyme1EcList = list(selfEnzyme1.ecNumbers) selfEnzyme2EcList = list(selfEnzyme2.ecNumbers) otherEnzyme1 = other.enzymePair[0] otherEnzyme2 = other.enzymePair[1] otherEnzyme1EcList = list(otherEnzyme1.ecNumbers) otherEnzyme2EcList = list(otherEnzyme2.ecNumbers) if selfEnzyme1EcList == otherEnzyme1EcList: if selfEnzyme2EcList == otherEnzyme2EcList: # then by gene ID if selfEnzyme1.uniqueID == otherEnzyme1.uniqueID: return selfEnzyme2.uniqueID >= otherEnzyme2.uniqueID else: return selfEnzyme1.uniqueID >= otherEnzyme1.uniqueID else: return selfEnzyme2EcList >= otherEnzyme2EcList else: return selfEnzyme1EcList >= otherEnzyme1EcList
[docs]class NeofunctionalisedEnzymes(): def __init__(self, enzymes: Set[Enzyme], geneDuplicationModel: GeneDuplication, eValue = defaultEValue, ignoreDuplicatesOutsideSet: bool = True): """ Neofunctionalisation events among certain `enzymes`. Parameters ---------- enzymes : Set[Enzyme] Enzymes among which to test for neofunctionalisation. Neofunctionalisations involving enzymes outside this set are **not** reported. geneDuplicationModel : GeneDuplication The model of gene duplication to use. eValue : float, optional Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant. ignoreDuplicatesOutsideSet : bool, optional If *True*, any neofunctionalisation involving an enzyme outside the `enzymes` set is not reported. This helps to exclude secondary metabolism when examining core metabolism. Raises ------ ValueError If a gene duplication model is used which requires instantiation, but only its class was given. """ if geneDuplicationModel == ChevronGeneDuplication: raise ValueError("Chevron gene duplication model requires you to instantiate an object, parametrised with the set of possibly orthologous organisms.") elif geneDuplicationModel == SimpleGroupGeneDuplication: raise ValueError("Simple group gene duplication model requires you to instantiate an object, parametrised with the set of organisms belonging to the same group.") self._geneDuplicationModel = geneDuplicationModel self._neofunctionalisations = set() # get possibly gene-duplicated enzymes, pointing to their duplicates geneIdToEnzyme = dict() for enzyme in enzymes: geneIdToEnzyme[enzyme.geneID] = enzyme if ignoreDuplicatesOutsideSet is True: # restrict allowed gene duplicates to the set of passed enzymes. This helps to avoid parts of the metabolism outside the core metabolism if isinstance(geneDuplicationModel, ChevronGeneDuplication): duplicatedEnzymePairs = geneDuplicationModel.getEnzymePairs(enzymes, eValue, ignoreDuplicatesOutsideSelf = ignoreDuplicatesOutsideSet, geneIdToEnzyme = geneIdToEnzyme) else: duplicatedEnzymePairs = geneDuplicationModel.getEnzymePairs(enzymes, eValue, ignoreDuplicatesOutsideSet = set(geneIdToEnzyme.keys()), geneIdToEnzyme = geneIdToEnzyme) else: # you want to check for neofunctionalisation outside the passed enzymes (usually core metabolism) duplicatedEnzymePairs = geneDuplicationModel.getEnzymePairs(enzymes, eValue) # in all pairs of duplicated enzymes, find neofunctionalised ones for enzymePair in duplicatedEnzymePairs: try: neofunctionalisation = Neofunctionalisation(enzymePair[0], enzymePair[1]) self._neofunctionalisations.add( neofunctionalisation ) except ValueError: # obviously not neofunctionalised pass # ignore
[docs] def getNeofunctionalisations(self, minimumEcDifference: int = None) -> Set[Neofunctionalisation]: """ Get neofunctionalisation events between two enzymes each. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. Returns ------- Set[Neofunctionalisation] Set of possible neofunctionalisation events. """ if minimumEcDifference is not None and minimumEcDifference > 1: filteredNeofunctionalisations = set() for neofunctionalisation in self._neofunctionalisations: if neofunctionalisation.getDifferingEcLevels() >= minimumEcDifference: filteredNeofunctionalisations.add( neofunctionalisation ) return filteredNeofunctionalisations else: return self._neofunctionalisations.copy()
[docs] def getEnzymes(self, minimumEcDifference: int = None) -> Set[Enzyme]: """ Get all neofunctionalised enzymes. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. Returns ------- Set[Enzyme] Set of all possibly neofunctionalised enzymes, regardless of the real direction of neofunctionalisation, which we can not determine here. """ neofunctionalisedEnzymes = set() for neofunctionalisation in self.getNeofunctionalisations(minimumEcDifference): neofunctionalisedEnzymes.update( neofunctionalisation.getEnzymes() ) return neofunctionalisedEnzymes
[docs] def filterGraph(self, enzymeGraph: SubstanceEnzymeGraph, minimumEcDifference: int = None) -> SubstanceEnzymeGraph: """ Filter enzyme graph to only contain neofunctionalised enzymes. Parameters ---------- enzymeGraph : SubstanceEnzymeGraph The enzyme graph to filter. minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. Returns ------- SubstanceEnzymeGraph A copy of `enzymeGraph`, leaving only edges with a neofunctionalised enzyme as key. """ graph = enzymeGraph.copy() neofunctionalisedEnzymes = self.getEnzymes(minimumEcDifference) graph.removeAllEnzymesExcept( neofunctionalisedEnzymes ) return graph
[docs] def colourGraph(self, enzymeGraph: SubstanceEnzymeGraph, colour: Export.Colour = Export.Colour.GREEN, minimumEcDifference: int = None) -> SubstanceEnzymeGraph: """ Colour enzyme graph's neofunctionalised enzyme edges. Parameters ---------- enzymeGraph : SubstanceEnzymeGraph The enzyme graph to colour. colour : Export.Colour, optional The colour to use for edges with neofunctionalised enzymes as key. minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. Returns ------- SubstanceEnzymeGraph A copy of `enzymeGraph` in which edges with neofunctionalised enzymes as key have an additional colour attribute, see :func:`FEV_KEGG.Drawing.Export.addColourAttribute`. """ graph = enzymeGraph.copy() neofunctionalisedEnzymes = self.getEnzymes(minimumEcDifference) Export.addColourAttribute(graph, colour, nodes = False, edges = neofunctionalisedEnzymes) return graph
[docs]class FunctionChange(): def __init__(self, ecA: EcNumber, ecB: EcNumber): """ Possible evolutionary change of enzymatic function, from one EC number to another. The direction of change, or if it really happened, can not be determined here! The order of EC numbers in this object is arbitrarily chosen to reflect their lexicographic order. A function change resembles the possibility that the first EC number has evolutionarily changed into the second one. Or the other way around, since the direction of evolution can not be determined here. A function change can never have the same EC number twice, nor can the first be lexicographically "bigger" than the second, see the examples section. Parameters ---------- ecA: EcNumber ecB: EcNumber Must be lexicographically "bigger" than `ecA`. Can not be equal to `ecA`. Raises ------ ValueError If `ecA` is lexicographically "bigger" than, or equal to, `ecB`. """ if ecA >= ecB: raise ValueError("EC number 1 is bigger than or equal to EC number 2.") self.ecA = ecA self.ecB = ecB self.ecPair = (ecA, ecB)
[docs] @classmethod def fromNeofunctionalisation(cls, neofunctionalisation: Neofunctionalisation) -> Set['FunctionChange']: """ Create combinations of function changes from a `neofunctionalisation`. Parameters ---------- neofunctionalisation : Neofunctionalisation Returns ------- Set[FunctionChange] Set of function changes which might have been caused by the `neofunctionalisation`. Since an enzyme of a neofunctionalisation can have multiple EC numbers, all combinations of the two enzymes' EC numbers are formed and treated as separate possible function changes. Examples -------- A: 1 B: 2 = (1, 2) A: 1 B: 1, 2 = (1, 2) A: 1, 2 B: 1, 3 = (1, 3) (1, 2) (2, 3) A: 1, 2 B: 3, 4 = (1, 3) (1, 4) (2, 3) (2, 4) A: 1, 2 B: 1, 2, 3 = (1, 3) (2, 3) """ ecNumbersA, ecNumbersB = neofunctionalisation.getEcNumbers() # cross product questionableEcPairs = itertools.product(ecNumbersA, ecNumbersB) # filter illegal products functionChanges = set() for pair in questionableEcPairs: try: functionChanges.add(cls(pair[0], pair[1])) except ValueError: pass return functionChanges
[docs] def getDifferingEcLevels(self) -> int: """ Get the number of EC levels in which the EC numbers differ. Returns ------- int Number of differing EC levels between the two EC numbers, starting with the substrate-level. For example 1.2.3.4 and 1.2.3.7 returns 1, while 1.2.3.4 and 1.8.9.10 returns 3. However, wildcards do **not** match numbers: 1.2.3.4 and 1.2.3.- returns 1! """ return 4 - self.ecA.matchingLevels(self.ecB, wildcardMatchesNumber = False)
[docs] def toHtml(self, short = False): """ Get the string representation as an HTML line. """ return '<td>' + self.ecPair[0].toHtml(short) + '</td><td><-></td><td>' + self.ecPair[1].toHtml(short) + '</td>'
def __str__(self): return self.ecPair.__str__() def __repr__(self): return self.__str__() def __eq__(self, other): if isinstance(self, other.__class__): return self.ecPair == other.ecPair return False def __ne__(self, other): return not self == other def __hash__(self): return self.ecPair.__hash__() def __lt__(self, other): return self.ecPair < other.ecPair def __gt__(self, other): return self.ecPair > other.ecPair def __le__(self, other): return self.ecPair <= other.ecPair def __ge__(self, other): return self.ecPair >= other.ecPair
[docs]class NeofunctionalisedECs(): def __init__(self, neofunctionalisedEnzymes: NeofunctionalisedEnzymes): """ EC numbers which are affected by neofunctionalisation events. Parameters ---------- neofunctionalisedEnzymes : NeofunctionalisedEnzymes Neofunctionalisation events among certain enzymes. """ self._neofunctionalisedEnzymes = neofunctionalisedEnzymes
[docs] def getNeofunctionalisationsForFunctionChange(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Dict[FunctionChange, Set[Neofunctionalisation]]: """ Get neofunctionalsation events, keyed by a change of function between the two enzymes. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. minimumOrganismsCount : int, optional Minimum number of organisms which have to be involved in the neofunctionalisations of each function change. If *None*, there is no filtering due to organism involvement. For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc), finally, if `minimumOrganismsCount` <= 3, the function change 1->2 is returned. Returns ------- Dict[FunctionChange, Set[Neofunctionalisation]] Dictionary of function changes, pointing to a set of neofunctionalisations which might have caused them. Since an enzyme of a neofunctionalisation can have multiple EC numbers, all combinations of the two enzymes' EC numbers are formed and treated as separate possible function changes. The neofunctionalisation is then saved again for each function change, which obviously leads to duplicated neofunctionalisation objects. Examples -------- A: 1 B: 2 = (1, 2) A: 1 B: 1, 2 = (1, 2) A: 1, 2 B: 1, 3 = (1, 3) (1, 2) (2, 3) A: 1, 2 B: 3, 4 = (1, 3) (1, 4) (2, 3) (2, 4) A: 1, 2 B: 1, 2, 3 = (1, 3) (2, 3) """ neofunctionalisationForFunctionChange = dict() # get neofunctionalisations for neofunctionalisation in self._neofunctionalisedEnzymes.getNeofunctionalisations(minimumEcDifference): # split sets of EC numbers into pair-wise combinations functionChanges = FunctionChange.fromNeofunctionalisation(neofunctionalisation) for functionChange in functionChanges: currentSet = neofunctionalisationForFunctionChange.get(functionChange, None) if currentSet is None: currentSet = set() neofunctionalisationForFunctionChange[functionChange] = currentSet currentSet.add(neofunctionalisation) # filter function changes with neofunctionalised enzymes which stem from too few organisms if minimumOrganismsCount is not None: keysToDelete = [] for functionChange, neofunctionalisations in neofunctionalisationForFunctionChange.items(): enzymes = set() for neofunctionalisation in neofunctionalisations: enzymes.update( neofunctionalisation.getEnzymes() ) organisms = set() for enzyme in enzymes: organisms.add( enzyme.organismAbbreviation ) # enough occuring organisms? if not len(organisms) >= minimumOrganismsCount: keysToDelete.append( functionChange ) # delete keys which failed the test for key in keysToDelete: del neofunctionalisationForFunctionChange[key] return neofunctionalisationForFunctionChange
[docs] def getFunctionChanges(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Set[FunctionChange]: """ Get all possible changes of function between the two enzymes of every neofunctionalisation. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. minimumOrganismsCount : int, optional Minimum number of organisms which have to be involved in the neofunctionalisations of each function change. If *None*, there is no filtering due to organism involvement. Returns ------- Set[FunctionChange] Set of all function changes, which meet the criteria. """ return set( self.getNeofunctionalisationsForFunctionChange(minimumEcDifference, minimumOrganismsCount).keys() )
[docs] def getEnzymesForFunctionChange(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Dict[FunctionChange, Set[Enzyme]]: """ Get enzymes of neofunctionalisations, keyed by a possible change of function. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. minimumOrganismsCount : int, optional Minimum number of organisms which have to be involved in the neofunctionalisations of each function change. If *None*, there is no filtering due to organism involvement. For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc), finally, if `minimumOrganismsCount` <= 3, the function change 1->2 is returned. Returns ------- Dict[FunctionChange, Set[Enzyme]] Dictionary of function changes, pointing to a set of enzymes involved in the neofunctionalisations which might have caused the function change. This can lead to many duplicated enzymes. """ enzymesForFunctionChange = dict() for functionChange, neofunctionalisations in self.getNeofunctionalisationsForFunctionChange(minimumEcDifference, minimumOrganismsCount).items(): currentSet = enzymesForFunctionChange.get(functionChange, None) if currentSet is None: currentSet = set() enzymesForFunctionChange[functionChange] = currentSet for neofunctionalisation in neofunctionalisations: currentSet.update( neofunctionalisation.getEnzymes() ) return enzymesForFunctionChange
[docs] def getNeofunctionalisationsForEC(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Dict[EcNumber, Set[Neofunctionalisation]]: """ Get neofunctionalisation events, keyed by an EC number participating in the change of function between the two enzymes. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. minimumOrganismsCount : int, optional Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number. If *None*, there is no filtering due to organism involvement. This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change. For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc). Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported. However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3. Returns ------- Dict[EcNumber, Set[Neofunctionalisation]] Dictionary of EC numbers which are part of function changes, pointing to a set of neofunctionalisations which might have caused them. Very likely has duplicated neofunctionalisations, because there are always at least two EC numbers involved in a neofunctionalisation. """ neofunctionalisationForEC = dict() # get neofunctionalisations for neofunctionalisation in self._neofunctionalisedEnzymes.getNeofunctionalisations(minimumEcDifference): # split sets of EC numbers into pair-wise combinations functionChanges = FunctionChange.fromNeofunctionalisation(neofunctionalisation) for functionChange in functionChanges: # for each EC number of a function change, save neofunctionalisation for ec in functionChange.ecPair: currentSet = neofunctionalisationForEC.get(ec, None) if currentSet is None: currentSet = set() neofunctionalisationForEC[ec] = currentSet currentSet.add(neofunctionalisation) # filter ECs with neofunctionalised enzymes which stem from too few organisms if minimumOrganismsCount is not None: keysToDelete = [] for ec, neofunctionalisations in neofunctionalisationForEC.items(): enzymes = set() for neofunctionalisation in neofunctionalisations: enzymes.update( neofunctionalisation.getEnzymes() ) organisms = set() for enzyme in enzymes: organisms.add( enzyme.organismAbbreviation ) # enough occuring organisms? if not len(organisms) >= minimumOrganismsCount: keysToDelete.append( ec ) # delete keys which failed the test for key in keysToDelete: del neofunctionalisationForEC[key] return neofunctionalisationForEC
[docs] def getECs(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Set[EcNumber]: """ Get EC numbers participating in the change of function due to neofunctionalisations. They could also be called "neofunctionalised" EC numbers. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. minimumOrganismsCount : int, optional Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number. If *None*, there is no filtering due to organism involvement. This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change. For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc). Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported. However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3. Returns ------- Set[EcNumber] Set of EC numbers which are part of function changes which possibly happened due to neofunctionalisations. """ return set( self.getNeofunctionalisationsForEC(minimumEcDifference, minimumOrganismsCount).keys() )
[docs] def getEnzymesForEC(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Dict[EcNumber, Set[Enzyme]]: """ Get enzymes of neofunctionalisations, keyed by an EC number of a possible function change. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. minimumOrganismsCount : int, optional Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number. If *None*, there is no filtering due to organism involvement. This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change. For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc). Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported. However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3. Returns ------- Dict[EcNumber, Set[Enzyme]] Dictionary of EC numbers, pointing to a set of enzymes involved in the neofunctionalisations which might have caused the function changes the EC number is part of. This can lead to many duplicated enzymes. """ enzymesForEC = dict() for ec, neofunctionalisations in self.getNeofunctionalisationsForEC(minimumEcDifference, minimumOrganismsCount).items(): currentSet = enzymesForEC.get(ec, None) if currentSet is None: currentSet = set() enzymesForEC[ec] = currentSet for neofunctionalisation in neofunctionalisations: currentSet.update( neofunctionalisation.getEnzymes() ) return enzymesForEC
[docs] def filterGraph(self, ecGraph: SubstanceEcGraph, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> SubstanceEcGraph: """ Filter EC graph to only contain "neofunctionalised" EC numbers. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. minimumOrganismsCount : int, optional Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number. If *None*, there is no filtering due to organism involvement. This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change. For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc). Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported. However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3. Returns ------- SubstanceEcGraph A copy of `ecGraph`, leaving only edges with a "neofunctionalised" EC as key. """ graph = ecGraph.copy() neofunctionalisedECs = self.getECs(minimumEcDifference, minimumOrganismsCount) graph.removeAllECsExcept( neofunctionalisedECs ) return graph
[docs] def colourGraph(self, ecGraph: SubstanceEcGraph, colour: Export.Colour = Export.Colour.GREEN, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> SubstanceEcGraph: """ Colour EC graph's "neofunctionalised" EC number edges. Parameters ---------- minimumEcDifference : int, optional May only be one of [1, 2, 3, 4]. If *None* or *1*, all neofunctionalisations are returned. If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels. They then describe a different reaction, instead of only a different substrate. For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is. minimumOrganismsCount : int, optional Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number. If *None*, there is no filtering due to organism involvement. This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change. For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc). Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported. However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3. Returns ------- SubstanceEcGraph A copy of `ecGraph` in which edges with "neofunctionalised" ECs as key have an additional colour attribute, see :func:`FEV_KEGG.Drawing.Export.addColourAttribute`. """ graph = ecGraph.copy() neofunctionalisedECs = self.getECs(minimumEcDifference, minimumOrganismsCount) Export.addColourAttribute(graph, colour, nodes = False, edges = neofunctionalisedECs) return graph