Source code for FEV_KEGG.Evolution.Events

from typing import Set, Dict, Tuple

from FEV_KEGG.Graph.Elements import Enzyme, EcNumber
from FEV_KEGG.Graph.SubstanceGraphs import SubstanceEnzymeGraph, SubstanceEcGraph
from FEV_KEGG.KEGG import Database
from builtins import set
from FEV_KEGG.KEGG.Organism import Group
from _collections_abc import Iterable
from FEV_KEGG import settings
import itertools
from FEV_KEGG.Drawing import Export

defaultEValue = settings.defaultEvalue
"""
Default threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
This can be overridden in each relevant method's `eValue` parameter in this module.
"""


[docs]class GeneFunctionConservation(object):
    """
    Evolutionary event of conserving a gene function (EC number) between a pair of arbitrary ancestor and descendant.
    
    The conditions for a gene function conservation are:
        - The EC number has been conserved, along the way from an older group of organisms to a newer one.
    """
[docs]    @staticmethod
    def getECs(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> Set[EcNumber]:
        """
        Get EC numbers which have been conserved between ancestor and descendant, existing in both.
        
        Parameters
        ----------
        ancestorEcGraph : SubstanceEcGraph
        descendantEcGraph : SubstanceEcGraph
        
        Returns
        -------
        Set[EcNumber]
            Set of EC numbers which occur in the ancestor's EC graph and in the decendants's, i.e. EC numbers which are conserved in the descendant.
        """
        conservedECs = ancestorEcGraph.getECs()
        conservedECs.intersection_update(descendantEcGraph.getECs())
        return conservedECs
    
[docs]    @staticmethod
    def getGraph(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> SubstanceEcGraph:
        """
        Get graph containing EC numbers which have been conserved between ancestor and descendant, existing in both.
        
        Parameters
        ----------
        ancestorEcGraph : SubstanceEcGraph
        descendantEcGraph : SubstanceEcGraph
        
        Returns
        -------
        SubstanceEcGraph
            Graph of EC numbers which occur in the ancestor's EC graph, and in the decendants's, i.e. EC numbers which are conserved in the descendant.
            Substance-EC-product edges are only included if both graphs, ancestor and descendant, have both nodes, substrate and product.
        """
        conservedGraph = ancestorEcGraph.intersection(descendantEcGraph, addCount=False)
        conservedGraph.removeIsolatedNodes()
        return conservedGraph
    
    
[docs]class GeneFunctionAddition(object):
    """
    Evolutionary event of adding a gene function (EC number) between a pair of arbitrary ancestor and descendant.
    
    The conditions for a gene function addition are:
        - The EC number has been added, from an unknown origin, along the way from an older group of organisms to a newer one.
    """
[docs]    @staticmethod
    def getECs(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> Set[EcNumber]:
        """
        Get EC numbers which have been added between ancestor and descendant, existing only in the descendant.
        
        Parameters
        ----------
        ancestorEcGraph : SubstanceEcGraph
        descendantEcGraph : SubstanceEcGraph
        
        Returns
        -------
        Set[EcNumber]
            Set of EC numbers which occur in the descendant's EC graph, but not in the ancestor's, i.e. EC numbers which are new to the descendant.
        """
        addedECs = descendantEcGraph.getECs()
        addedECs.difference_update(ancestorEcGraph.getECs())
        return addedECs
    
[docs]    @staticmethod
    def getGraph(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> SubstanceEcGraph:
        """
        Get graph containing EC numbers which have been added between ancestor and descendant, existing only in the descendant.
        
        Parameters
        ----------
        ancestorEcGraph : SubstanceEcGraph
        descendantEcGraph : SubstanceEcGraph
        
        Returns
        -------
        SubstanceEcGraph
            Graph of EC numbers which occur in the descendant's EC graph, but not in the ancestor's, i.e. EC numbers which are new in the descendant.
            Substance-EC-product edges are only included if both graphs, ancestor and descendant, have both nodes, substrate and product.
        """
        addedGraph = descendantEcGraph.difference(ancestorEcGraph, subtractNodes=False)
        addedGraph.removeIsolatedNodes()
        return addedGraph


[docs]class GeneFunctionLoss(object):
    """
    Evolutionary event of losing a gene function (EC number) between a pair of arbitrary ancestor and descendant.
    
    The conditions for a gene function loss are:
        - The EC number has been lost, along the way from an older group of organisms to a newer one.
    """
[docs]    @staticmethod
    def getECs(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> Set[EcNumber]:
        """
        Get EC numbers which have been lost between ancestor and descendant, existing only in the ancestor.
        
        Parameters
        ----------
        ancestorEcGraph : SubstanceEcGraph
        descendantEcGraph : SubstanceEcGraph
        
        Returns
        -------
        Set[EcNumber]
            Set of EC numbers which occur in the ancestor's EC graph, but not in the decendants's, i.e. EC numbers which are lost to the descendant.
        """
        lostECs = ancestorEcGraph.getECs()
        lostECs.difference_update(descendantEcGraph.getECs())
        return lostECs
    
[docs]    @staticmethod
    def getGraph(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> SubstanceEcGraph:
        """
        Get graph containing EC numbers which have been lost between ancestor and descendant, existing only in the ancestor.
        
        Parameters
        ----------
        ancestorEcGraph : SubstanceEcGraph
        descendantEcGraph : SubstanceEcGraph
        
        Returns
        -------
        SubstanceEcGraph
            Graph of EC numbers which occur in the ancestor's EC graph, but not in the decendants's, i.e. EC numbers which are lost in the descendant.
            Substance-EC-product edges are only included if both graphs, ancestor and descendant, have both nodes, substrate and product.
        """
        lostGraph = ancestorEcGraph.difference(descendantEcGraph, subtractNodes=False)
        lostGraph.removeIsolatedNodes()
        return lostGraph


[docs]class GeneFunctionDivergence(object):
    """
    Evolutionary event of diverging (adding or losing) a gene function (EC number) between a pair of arbitrary ancestor and descendant.
    
    The conditions for a gene function divergence are:
        - The EC number exists in an older group of organisms, but not in a newer one, or the other way around.
    """
[docs]    @staticmethod
    def getECs(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> Set[EcNumber]:
        """
        Get EC numbers which have diverged between ancestor and descendant, existing only in either one of them.
        
        Obviously, `ancestorEcGraph` and `descendantEcGraph` can be swapped here without changing the result.
        
        Parameters
        ----------
        ancestorEcGraph : SubstanceEcGraph
        descendantEcGraph : SubstanceEcGraph
        
        Returns
        -------
        Set[EcNumber]
            Set of EC numbers which occur in the ancestor's EC graph, but not in the decendants's and vice versa, i.e. EC numbers which only exist in either one of the organism groups.
        """
        divergedECs = ancestorEcGraph.getECs()
        divergedECs.symmetric_difference(descendantEcGraph.getECs())
        return divergedECs
    
[docs]    @staticmethod
    def getGraph(ancestorEcGraph: SubstanceEcGraph, descendantEcGraph: SubstanceEcGraph) -> SubstanceEcGraph:
        """
        Get graph containing EC numbers which have diverged between ancestor and descendant, existing only in either one of them.
        
        Parameters
        ----------
        ancestorEcGraph : SubstanceEcGraph
        descendantEcGraph : SubstanceEcGraph
        
        Returns
        -------
        SubstanceEcGraph
            Graph of EC numbers which occur in the ancestor's EC graph, but not in the decendants's and vice versa, i.e. EC numbers which only exist in either one of the organism groups.
            Substance-EC-product edges are only included if both graphs, ancestor and descendant, have both nodes, substrate and product.
        """
        addedGraph = GeneFunctionAddition.getGraph(ancestorEcGraph, descendantEcGraph)
        lostGraph = GeneFunctionLoss.getGraph(ancestorEcGraph, descendantEcGraph)
        divergedGraph = addedGraph.union(lostGraph, addCount=False)
        return divergedGraph






[docs]class GeneDuplication(object): 
    """
    Abstract class for any type of gene duplication.
    """

[docs]class SimpleGroupGeneDuplication(GeneDuplication):
    """
    Evolutionary event of duplicating a gene, regardless of ancestoral bonds in the comparison group.
    
    The conditions for a 'simple group' gene duplication are:
        - The gene has at least one homolog within the set of organisms its organism belongs to.
    """
    def __init__(self, sameGroupOrganisms: 'Iterable[Organism] or KEGG.Organism.Group'):
        """
        Simple group gene duplication extends simple gene duplication by expanding the term 'paralog' to every organism in the set of organisms the gene's organism blongs to.
        In contrast to :class:`SimpleGeneDuplication`, this class has to be instantiated, using the aforementioned set of organisms belonging to each other.
        This would usually be a :class:`FEV_KEGG.KEGG.Organism.Group` of the same :class:`FEV_KEGG.Evolution.Clade.Clade`. 
        
        Parameters
        ----------
        sameGroupOrganisms : Iterable[Organism] or Organism.Group
            Organisms which will be searched for the occurence of homologs, i.e. are considered "semi-paralogously" related.
        
        Attributes
        ----------
        self.sameGroupOrganisms : Iterable[Organism]
        
        Raises
        ------
        ValueError
            If `sameGroupOrganisms` is of wrong type.
        
        Warnings
        --------
        This takes much longer than :class:`SimpleGeneDuplication`, because each sought gene is compared between all organisms of the same group, not only within its own organism.
        """
        if isinstance(sameGroupOrganisms, Group):
            self.sameGroupOrganisms = sameGroupOrganisms.organisms
        elif isinstance(sameGroupOrganisms, Iterable):
            self.sameGroupOrganisms = sameGroupOrganisms
        else:
            raise ValueError("'sameGroupOrganisms' must be of type Iterable or KEGG.Organism.Group")
    
[docs]    def getEnzymes(self, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, returnMatches = False, ignoreDuplicatesOutsideSet = None):
        """
        Get gene-duplicated enzymes.
        
        Parameters
        ----------
        enzymes : Set[Enzyme] or SubstanceEnzymeGraph
            Set of enzymes to be checked for gene duplication, or a graph.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        returnMatches : bool, optional
            If *True*, return not only `enzymes` that have homologs, but also which homologs they have. Useful for filtering for relevant homologs afterwards.
        ignoreDuplicatesOutsideSet : Set[GeneID], optional
            If *None*, report all found duplicates.
            If not *None*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set.
            This can, for example, serve to exclude duplicates in secondary metabolism.
        
        Returns
        -------
        Set[Enzyme] or Dict[Element, Set[GeneID]]
            If returnMatches == *False*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition.
            If returnMatches == *True*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition, pointing to a set of gene IDs of the found homologs.
        
        Raises
        ------
        ValueError
            If any organism does not exist.
        HTTPError
            If any gene does not exist.
        URLError
            If connection to KEGG fails.
        """
        if isinstance(enzymes, SubstanceEnzymeGraph):
            enzymes = enzymes.getEnzymes()
        
        # find gene duplicated enzymes according to simple gene duplication
        shallReturnMatches = returnMatches or (ignoreDuplicatesOutsideSet is not None)
        simpleGeneDuplicates = SimpleGeneDuplication.getEnzymes(enzymes, eValue, returnMatches = shallReturnMatches, ignoreDuplicatesOutsideSet = ignoreDuplicatesOutsideSet)
        
        if returnMatches or ignoreDuplicatesOutsideSet is not None:
            simpleGeneDuplicatesSet = simpleGeneDuplicates.keys()
        else:
            simpleGeneDuplicatesSet = simpleGeneDuplicates
        
        duplicatedEnzymes = dict()
        
        # those enzymes already duplicated according to simple gene duplication do not have to be tested again
        soughtEnzymes = enzymes.copy()
        soughtEnzymes.difference_update( simpleGeneDuplicatesSet )
            
        # get orthologs
        geneIDs = [enzyme.geneID for enzyme in soughtEnzymes]
        
        if returnMatches or ignoreDuplicatesOutsideSet is not None: # need to get all orthologs
            
            matchingsDict = Database.getOrthologsBulk(geneIDs, self.sameGroupOrganisms, eValue) # GeneID -> List[ Matching ]
            
            for enzyme in enzymes: # for all enzymes, even the ones already found as paralogs
                
                # add paralogs
                paralogousGeneIDs = simpleGeneDuplicates.get(enzyme.geneID)
                if paralogousGeneIDs is not None:
                    
                    if len(paralogousGeneIDs) > 0:
                        
                        duplicatedEnzymes[enzyme] = paralogousGeneIDs
                    
                    continue # can not be in orthologs if it was already paralogous
                
                # add orthologs
                orthologousMatchings = matchingsDict.get(enzyme.geneID)
                if orthologousMatchings is not None:
                    
                    if len(orthologousMatchings) > 0:
                        
                        matches = []
                        for matching in orthologousMatchings:
                            matches.extend(matching.matches)
                        
                        matchedGeneIDs = {match.foundGeneID for match in matches}
                        
                        if len(matchedGeneIDs) > 0:
                            currentDuplicates = duplicatedEnzymes.get(enzyme)
                            
                            if currentDuplicates is None:
                                currentDuplicates = set()
                                duplicatedEnzymes[enzyme] = currentDuplicates
                                
                            currentDuplicates.update(matchedGeneIDs)
            
        else: # only interesting IF there are orthologs
        
            orthologousOrganismsDict = Database.hasOrthologsBulk(geneIDs, self.sameGroupOrganisms, eValue) # GeneID -> List[ organismAbbreviation ]
            
            # regarding each enzyme sought
            for enzyme in soughtEnzymes:
                # count orthologous GeneIDs
                orthologousOrganisms = orthologousOrganismsDict.get(enzyme.geneID)
                if orthologousOrganisms is None:
                    continue
                if len(orthologousOrganisms) > 0: # has at least one ortholog in any other organism
                    duplicatedEnzymes[enzyme] = None #orthologousOrganisms
        
        if ignoreDuplicatesOutsideSet is not None:
            filteredDuplicatedEnzymes = dict()
            
            for enzyme, matchGeneIDs in duplicatedEnzymes.items():
                
                matchesInSearchedSet = ignoreDuplicatesOutsideSet.intersection( matchGeneIDs )
                
                if len(matchesInSearchedSet) > 0: # some of the matches are in the set of enzymes to be checked for duplicates 
                    filteredDuplicatedEnzymes[enzyme] = matchesInSearchedSet
            
            duplicatedEnzymes = filteredDuplicatedEnzymes        
        
        if returnMatches:
            return duplicatedEnzymes
        else:
            return set(duplicatedEnzymes.keys())
    
[docs]    def getEnzymePairs(self, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, ignoreDuplicatesOutsideSet = None, geneIdToEnzyme = None) -> Set[Tuple[Enzyme, Enzyme]]:
        """
        Get gene-duplicated enzymes, in pairs of duplicates.
        
        If enzymeA is a duplicate of enzymeB and vice versa, this returns symmetric duplicates of the form (enzymeA, enzymeB) and (enzymeB, enzymeA).
        
        Parameters
        ----------
        enzymes : Set[Enzyme] or SubstanceEnzymeGraph
            Set of enzymes to be checked for gene duplication, or a graph.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        ignoreDuplicatesOutsideSet : Set[GeneID], optional
            If *None*, report all found duplicates.
            If not *None*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set.
            This can, for example, serve to exclude duplicates in secondary metabolism.
        geneIdToEnzyme : Dict[GeneID, Enzyme], optional
            Dictionary for mapping each gene ID of every found duplicate to an enzyme object.
            If *None*, gets the enzyme from the database. This avoids the KeyError, but can cause a lot of network load.
        
        Returns
        -------
        Set[Tuple[Enzyme, Enzyme]]
            Set of pairs of gene-duplicated enzymes, realised as tuples. The order is arbitrary and there will almost certainly be 100% duplicates.
        
        Raises
        ------
        KeyError
            If `geneIdToEnzyme` is passed, but does not contain the gene ID of every duplicate.
        """
        duplicatedEnzymeMatches = self.getEnzymes(enzymes, eValue, returnMatches = True, ignoreDuplicatesOutsideSet = ignoreDuplicatesOutsideSet)
        
        # expand matches of homologous gene IDs to pairs of duplicated enzymes. Which we can do (without wasting further resources) only here, because only here we have the geneID -> enzyme dict!
        duplicatedEnzymePairs = set()
        
        
        if geneIdToEnzyme is None: # need to get enzyme objects from database
            allGeneIDs = set()
            for geneIDs in duplicatedEnzymeMatches.values():
                allGeneIDs.update( geneIDs )
            
            geneIdToEnzyme = dict()
            
            for geneID, gene in Database.getGeneBulk(allGeneIDs).items():
                enzyme = Enzyme.fromGene(gene)
                geneIdToEnzyme[geneID] = enzyme
            
        
        for enzymeA, geneIDs in duplicatedEnzymeMatches.items():
            
            for geneID in geneIDs:
                
                enzymeB = geneIdToEnzyme[geneID]
                
                duplicatedEnzymePairs.add( (enzymeA, enzymeB) )
        
        return duplicatedEnzymePairs
    
[docs]    def filterEnzymes(self, substanceEnzymeGraph: SubstanceEnzymeGraph, eValue = defaultEValue) -> SubstanceEnzymeGraph:
        """
        Remove all enzymes from a graph which have not been gene-duplicated.
        
        Parameters
        ----------
        substanceEnzymeGraph : SubstanceEnzymeGraph
            Graph of enzymes to be checked for gene duplication.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        
        Returns
        -------
        SubstanceEnzymeGraph
            A copy of the `substanceEnzymeGraph` containing only enzymes which fulfil the conditions of this gene duplication definition.
        
        Raises
        ------
        ValueError
            If any organism does not exist.
        HTTPError
            If any gene does not exist.
        URLError
            If connection to KEGG fails.
        """
        graph = substanceEnzymeGraph.copy()
        possibleGeneDuplicates = self.getEnzymes(substanceEnzymeGraph, eValue)
        graph.removeAllEnzymesExcept( possibleGeneDuplicates )
        return graph


[docs]class SimpleGeneDuplication(GeneDuplication):
    """
    Evolutionary event of duplicating a gene, regardless of ancestoral bonds.
    
    The conditions for a 'simple' gene duplication are:
        - The gene has at least one paralog.
    """    
[docs]    @staticmethod
    def getEnzymes(enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, returnMatches = False, ignoreDuplicatesOutsideSet = None, preCalculatedEnzymes = None):
        """
        Get gene-duplicated enzymes.
        
        Parameters
        ----------
        enzymes : Set[Enzyme] or SubstanceEnzymeGraph
            Set of enzymes to be checked for gene duplication, or a graph.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        returnMatches : bool, optional
            If *True*, return not only `enzymes` that have homologs, but also which homologs they have. Useful for filtering for relevant homologs afterwards.
        ignoreDuplicatesOutsideSet : Set[GeneID] or *True*, optional
            If *None*, report all found duplicates.
            If *True*, automatically restrict to all enzymes in `substanceEnzymeGraph`.
            If a set, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set. Beware, the set has to contain the enzymes' gene ID!
            This can, for example, serve to exclude duplicates in secondary metabolism.
        
        Returns
        -------
        Set[Enzyme] or Dict[Element, Set[GeneID]]
            If returnMatches == *False*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition.
            If returnMatches == *True*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition, pointing to a set of gene IDs of the found homologs.
        
        Raises
        ------
        ValueError
            If any organism does not exist.
        HTTPError
            If any gene does not exist.
        URLError
            If connection to KEGG fails.
        """
        if isinstance(enzymes, SubstanceEnzymeGraph):
            enzymes = enzymes.getEnzymes()
        
        
        if preCalculatedEnzymes is not None:
            
            if returnMatches is True:
                result = dict()
            else:
                result = set()
            
            # filter preCalculatedEnzymes by enzymes
            for enzyme, duplicatesSet in preCalculatedEnzymes.items():
                
                if enzyme in enzymes:
                        
                    validDuplicates = set()
                    for duplicate in duplicatesSet:
                        
                        if ignoreDuplicatesOutsideSet is None or duplicate.geneID in ignoreDuplicatesOutsideSet:
                            validDuplicates.add( duplicate.geneID )
                    
                    if len(validDuplicates) > 0:
                        
                        if returnMatches is True:
                            result[enzyme] = validDuplicates
                        
                        else:
                            result.add( enzyme )
            
            return result
        
        
        possibleGeneDuplicates = dict()
        
        geneIDs = {enzyme.geneID for enzyme in enzymes}
        
        matchingsDict = Database.getParalogsBulk(geneIDs, eValue)
        
        for enzyme in enzymes:
            matching = matchingsDict.get(enzyme.geneID, None)
            
            if matching is None:
                print( 'WARNING: data for GeneID ' + str(enzyme.geneID) + ' could not be downloaded. Maybe you want to exclude this erroneous organism completely, before it skews statistics? See quirks.py')
                continue
            
            paralogs = matching.matches
            if len(paralogs) > 0:
                possibleGeneDuplicates[enzyme] = [match.foundGeneID for match in paralogs]
        
        if ignoreDuplicatesOutsideSet is True:
            ignoreDuplicatesOutsideSet = geneIDs
        
        if ignoreDuplicatesOutsideSet is not None and len(ignoreDuplicatesOutsideSet) > 0:
            filteredPossibleGeneDuplicates = dict()
            
            for enzyme, matchGeneIDs in possibleGeneDuplicates.items():
                
                # which genes have been found AND are in the set of relevant duplicates?
                matchesInSearchedSet = ignoreDuplicatesOutsideSet.intersection( matchGeneIDs )
                
                if len(matchesInSearchedSet) > 0: # some of the matches are in the set of relevant duplicates
                    filteredPossibleGeneDuplicates[enzyme] = matchesInSearchedSet
            
            possibleGeneDuplicates = filteredPossibleGeneDuplicates
        
        if returnMatches:
            return possibleGeneDuplicates
        else:
            return set(possibleGeneDuplicates.keys())
    
[docs]    @classmethod
    def getEnzymePairs(cls, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, ignoreDuplicatesOutsideSet = None, geneIdToEnzyme = None, preCalculatedEnzymes = None) -> Set[Tuple[Enzyme, Enzyme]]:
        """
        Get gene-duplicated enzymes, in pairs of duplicates.
        
        If enzyme A is a duplicate of enzyme B and vice versa, this does not return duplicates, but returns only one pair, with the "smaller" enzyme as the first value. An enzyme is "smaller" if its gene ID string is "smaller".
        
        Parameters
        ----------
        enzymes : Set[Enzyme] or SubstanceEnzymeGraph
            Set of enzymes to be checked for gene duplication, or a graph.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        ignoreDuplicatesOutsideSet : Set[GeneID] or *True*, optional
            If *None*, report all found duplicates.
            If *True*, automatically restrict to all enzymes in `substanceEnzymeGraph`.
            If a set, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set. Beware, the set has to contain the enzymes' gene ID!
            This can, for example, serve to exclude duplicates in secondary metabolism.
        geneIdToEnzyme : Dict[GeneID, Enzyme], optional
            Dictionary for mapping each gene ID of every found duplicate to an enzyme object.
            If *None*, gets the enzyme from the database. This avoids the KeyError, but can cause a lot of network load.
        
        Returns
        -------
        Set[Tuple[Enzyme, Enzyme]]
            Set of pairs of gene-duplicated enzymes, realised as tuples. The order is arbitrary.
        
        Raises
        ------
        KeyError
            If `geneIdToEnzyme` is passed, but does not contain the gene ID of every duplicate.
        """
        duplicatedEnzymeMatches = cls.getEnzymes(enzymes, eValue, returnMatches = True, ignoreDuplicatesOutsideSet = ignoreDuplicatesOutsideSet, preCalculatedEnzymes = preCalculatedEnzymes)
        
        # expand matches of homologous gene IDs to pairs of duplicated enzymes. Which we can do (without wasting further resources) only here, because only here we have the geneID -> enzyme dict!
        duplicatedEnzymePairs = set()
        
        
        if geneIdToEnzyme is None: # need to get enzyme objects from database
            allGeneIDs = set()
            for geneIDs in duplicatedEnzymeMatches.values():
                allGeneIDs.update( geneIDs )
            
            geneIdToEnzyme = dict()
            
            for geneID, gene in Database.getGeneBulk(allGeneIDs).items():
                enzyme = Enzyme.fromGene(gene)
                geneIdToEnzyme[geneID] = enzyme
            
        
        for enzymeA, geneIDs in duplicatedEnzymeMatches.items():
            
            for geneID in geneIDs:
                
                enzymeB = geneIdToEnzyme[geneID]
                
                duplicatedEnzymePairs.add( (enzymeA, enzymeB) )
        
        # filter symmetric duplicates
        deduplicatedEnzymePairs = set()
        for enzymeA, enzymeB in duplicatedEnzymePairs:
            if enzymeA <= enzymeB:
                deduplicatedEnzymePairs.add( (enzymeA, enzymeB) )
            else:
                deduplicatedEnzymePairs.add( (enzymeB, enzymeA) )
        
        return deduplicatedEnzymePairs
        
    
[docs]    @classmethod
    def filterEnzymes(cls, substanceEnzymeGraph: SubstanceEnzymeGraph, eValue = defaultEValue, ignoreDuplicatesOutsideSet = None, preCalculatedEnzymes = None) -> SubstanceEnzymeGraph:
        """
        Remove all enzymes from a graph which have not been gene-duplicated.
        
        Parameters
        ----------
        substanceEnzymeGraph : SubstanceEnzymeGraph
            Graph of enzymes to be checked for gene duplication.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        ignoreDuplicatesOutsideSet : Set[GeneID] or *True*, optional
            If *None*, report all found duplicates.
            If *True*, automatically restrict to all enzymes in `substanceEnzymeGraph`.
            If a set, count only such enzymes as gene duplicated, which have at least one of their duplicates inside this set. Beware, the set has to contain the enzymes' gene ID!
            This can, for example, serve to exclude duplicates in secondary metabolism.
        
        Returns
        -------
        SubstanceEnzymeGraph
            A copy of the `substanceEnzymeGraph` containing only enzymes which fulfil the conditions of this gene duplication definition.
        
        Raises
        ------
        ValueError
            If any organism does not exist.
        HTTPError
            If any gene does not exist.
        URLError
            If connection to KEGG fails.
        """
        graph = substanceEnzymeGraph.copy()
        possibleGeneDuplicates = cls.getEnzymes(substanceEnzymeGraph, eValue, returnMatches = False, ignoreDuplicatesOutsideSet = ignoreDuplicatesOutsideSet, preCalculatedEnzymes = preCalculatedEnzymes)
        graph.removeAllEnzymesExcept( possibleGeneDuplicates )
        return graph
        
    

[docs]class ChevronGeneDuplication(GeneDuplication):
    """
    Evolutionary event of duplicating a gene, in dependence of a certain ancestoral bond.
    
    The conditions for a 'chevron' gene duplication are:
        - The gene has at least one paralog.
        - The gene has at least one ortholog in a pre-defined set of organisms.
    """
    def __init__(self, possiblyOrthologousOrganisms: 'Iterable[Organism] or KEGG.Organism.Group'):
        """
        Chevron gene duplication extends simple gene duplication by limiting the possibly duplicated genes via a set of possibly orthologous organisms.
        In contrast to :class:`SimpleGeneDuplication`, this class has to be instantiated, using the aforementioned set of possibly orthologous organisms. 
        
        Parameters
        ----------
        possiblyOrthologousOrganisms : Iterable[Organism] or Organism.Group
            Organisms which will be searched for the occurence of orthologs, i.e. are considered ancestoral.
        
        Attributes
        ----------
        self.possiblyOrthologousOrganisms : Iterable[Organism]
        
        Raises
        ------
        ValueError
            If `possiblyOrthologousOrganisms` is of wrong type.
        
        Warnings
        --------
        This takes much longer than :class:`SimpleGeneDuplication`, because additionally, each found paralog is searched for an ortholog in all organisms of the other group.
        However, if you set `returnMatches` == *False* and `ignoreDuplicatesOutsideSelf` == *False*, the search is aborted with the very first ortholog, which is much faster than getting all orthologs.
        Because in this model even a single orthologous match is enough to prove gene duplication, we do not necessarily have to fully search all organisms.
        """
        if isinstance(possiblyOrthologousOrganisms, Group):
            self.possiblyOrthologousOrganisms = possiblyOrthologousOrganisms.organisms
        elif isinstance(possiblyOrthologousOrganisms, Iterable):
            self.possiblyOrthologousOrganisms = possiblyOrthologousOrganisms
        else:
            raise ValueError("'possiblyOrthologusOrganisms' must be of type Iterable or KEGG.Organism.Group")
            
    
[docs]    def getEnzymes(self, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, returnMatches = False, ignoreDuplicatesOutsideSelf = False):
        """
        Get gene-duplicated enzymes.
        
        Parameters
        ----------
        enzymes : Set[Enzyme] or SubstanceEnzymeGraph
            Set of enzymes to be checked for gene duplication, or a graph.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        returnMatches : bool, optional
            If *True*, return not only `enzymes` that have homologs, but also which homologs they have. Useful for filtering for relevant homologs afterwards.
        ignoreDuplicatesOutsideSelf : bool, optional
            If *True*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside the set of enzymes searched for duplicates.
            This can, for example, serve to exclude duplicates in secondary metabolism.
        
        Returns
        -------
        Set[Enzyme] or Dict[Element, Set[GeneID]]
            If returnMatches == *False*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition.
            If returnMatches == *True*, all enzymes in `enzymes` which fulfil the conditions of this gene duplication definition, pointing to a set of gene IDs of the found homologs.
        
        Raises
        ------
        ValueError
            If any organism does not exist.
        HTTPError
            If any gene does not exist.
        URLError
            If connection to KEGG fails.
        """
        # graph -> set
        if isinstance(enzymes, SubstanceEnzymeGraph):
            enzymes = enzymes.getEnzymes()
        
        # get paralogs first
        possibleGeneDuplicates = SimpleGeneDuplication.getEnzymes(enzymes, eValue, returnMatches = returnMatches or ignoreDuplicatesOutsideSelf, ignoreDuplicatesOutsideSet = {enzyme.geneID for enzyme in enzymes}) # always need returnMatches if ignoreDuplicatesOutsideSelf is True! 
        
        if returnMatches or ignoreDuplicatesOutsideSelf:
            possibleGeneDuplicatesSet = possibleGeneDuplicates.keys()
        else:
            possibleGeneDuplicatesSet = possibleGeneDuplicates
        
        if len( possibleGeneDuplicatesSet ) == 0: # nothing to do, because there are no paralogs
            return possibleGeneDuplicates
        
        duplicatedEnzymes = dict()
        
        # get orthologs
        geneIDs = {enzyme.geneID for enzyme in possibleGeneDuplicatesSet}
        
        if returnMatches or ignoreDuplicatesOutsideSelf: # need to get all orthologs
            
            matchingsDict = Database.getOrthologsBulk(geneIDs, self.possiblyOrthologousOrganisms, eValue) # GeneID -> List[ Matching ]
            
            for enzyme in possibleGeneDuplicatesSet:
                
                # add orthologs
                orthologousMatchings = matchingsDict.get(enzyme.geneID)
                if orthologousMatchings is not None:

                    if len(orthologousMatchings) > 0:
                        
                        matches = []
                        for matching in orthologousMatchings:
                            matches.extend(matching.matches)
                            
                        duplicatedEnzymes[enzyme] = {match.foundGeneID for match in matches}
                
                # add paralogs
                paralogousGeneIDs = possibleGeneDuplicates.get(enzyme.geneID)
                if paralogousGeneIDs is not None:
                    
                    if len(paralogousGeneIDs) > 0:
                        
                        currentDuplicates = duplicatedEnzymes.get(enzyme)
                        if currentDuplicates is None:
                            currentDuplicates = set()
                            duplicatedEnzymes[enzyme] = currentDuplicates
                        
                        currentDuplicates.update(paralogousGeneIDs)
            
        else: # only interesting IF there are orthologs
            
            orthologousOrganismsDict = Database.hasOrthologsBulk(geneIDs, self.possiblyOrthologousOrganisms, eValue) # GeneID -> List[ organismAbbreviation ]
            
            for enzyme in possibleGeneDuplicatesSet:
            
                orthologousOrganisms = orthologousOrganismsDict.get(enzyme.geneID)
                if orthologousOrganisms is None:
                    continue
                if len(orthologousOrganisms) > 0:
                    duplicatedEnzymes[enzyme] = None #orthologousOrganisms
        
        if ignoreDuplicatesOutsideSelf:
            filteredDuplicatedEnzymes = dict()
            
            for enzyme, matchGeneIDs in duplicatedEnzymes.items():
                
                matchesInSearchedSet = geneIDs.intersection( matchGeneIDs )
                
                if len(matchesInSearchedSet) > 0: # some of the matches are in the set of enzymes to be checked for duplicates 
                    filteredDuplicatedEnzymes[enzyme] = matchesInSearchedSet
            
            duplicatedEnzymes = filteredDuplicatedEnzymes        
        
        if returnMatches:
            return duplicatedEnzymes
        else:
            return set(duplicatedEnzymes.keys())
    
[docs]    def getEnzymePairs(self, enzymes: 'Set[Enzyme] or SubstanceEnzymeGraph', eValue = defaultEValue, ignoreDuplicatesOutsideSelf = False, geneIdToEnzyme = None) -> Set[Tuple[Enzyme, Enzyme]]:
        """
        Get gene-duplicated enzymes, in pairs of duplicates.
        
        If enzyme A is a duplicate of enzyme B and vice versa, this does not return duplicates, but returns only one pair, with the "smaller" enzyme as the first value. An enzyme is "smaller" if its gene ID string is "smaller".
        
        Parameters
        ----------
        enzymes : Set[Enzyme] or SubstanceEnzymeGraph
            Set of enzymes to be checked for gene duplication, or a graph.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        ignoreDuplicatesOutsideSelf : bool, optional
            If *True*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside the set of enzymes searched for duplicates.
            This can, for example, serve to exclude duplicates in secondary metabolism.
        geneIdToEnzyme : Dict[GeneID, Enzyme], optional
            Dictionary for mapping each gene ID of every found duplicate to an enzyme object.
            If *None*, gets the enzyme from the database. This avoids the KeyError, but can cause a lot of network load.
        
        Returns
        -------
        Set[Tuple[Enzyme, Enzyme]]
            Set of pairs of gene-duplicated enzymes, realised as tuples. The order is arbitrary.
        
        Raises
        ------
        KeyError
            If `geneIdToEnzyme` is passed, but does not contain the gene ID of every duplicate.
        """
        duplicatedEnzymeMatches = self.getEnzymes(enzymes, eValue, returnMatches = True, ignoreDuplicatesOutsideSelf = ignoreDuplicatesOutsideSelf)
        
        # expand matches of homologous gene IDs to pairs of duplicated enzymes. Which we can do (without wasting further resources) only here, because only here we have the geneID -> enzyme dict!
        duplicatedEnzymePairs = set()
        
        
        if geneIdToEnzyme is None: # need to get enzyme objects from database
            allGeneIDs = set()
            for geneIDs in duplicatedEnzymeMatches.values():
                allGeneIDs.update( geneIDs )
            
            geneIdToEnzyme = dict()
            
            for geneID, gene in Database.getGeneBulk(allGeneIDs).items():
                enzyme = Enzyme.fromGene(gene)
                geneIdToEnzyme[geneID] = enzyme
            
        
        for enzymeA, geneIDs in duplicatedEnzymeMatches.items():
            
            for geneID in geneIDs:
                
                enzymeB = geneIdToEnzyme[geneID]
                
                duplicatedEnzymePairs.add( (enzymeA, enzymeB) )
        
        # filter symmetric duplicates
        deduplicatedEnzymePairs = set()
        for enzymeA, enzymeB in duplicatedEnzymePairs:
            if enzymeA <= enzymeB:
                deduplicatedEnzymePairs.add( (enzymeA, enzymeB) )
            else:
                deduplicatedEnzymePairs.add( (enzymeB, enzymeA) )
        
        return deduplicatedEnzymePairs
    
[docs]    def filterEnzymes(self, substanceEnzymeGraph: SubstanceEnzymeGraph, eValue = defaultEValue, ignoreDuplicatesOutsideSelf = False) -> SubstanceEnzymeGraph:
        """
        Remove all enzymes from a graph which have not been gene-duplicated.
        
        Parameters
        ----------
        substanceEnzymeGraph : SubstanceEnzymeGraph
            Graph of enzymes to be checked for gene duplication.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        ignoreDuplicatesOutsideSelf : bool, optional
            If *True*, count only such enzymes as gene duplicated, which have at least one of their duplicates inside the set of enzymes searched for duplicates.
            This can, for example, serve to exclude duplicates in secondary metabolism.
        
        Returns
        -------
        SubstanceEnzymeGraph
            A copy of the `substanceEnzymeGraph` containing only enzymes which fulfil the conditions of this gene duplication definition.
        
        Raises
        ------
        ValueError
            If any organism does not exist.
        HTTPError
            If any gene does not exist.
        URLError
            If connection to KEGG fails.
        """
        graph = substanceEnzymeGraph.copy()
        possibleGeneDuplicates = self.getEnzymes(substanceEnzymeGraph, eValue, returnMatches = False, ignoreDuplicatesOutsideSelf = ignoreDuplicatesOutsideSelf)
        graph.removeAllEnzymesExcept( possibleGeneDuplicates )
        return graph








[docs]class Neofunctionalisation():
    
    def __init__(self, enzymeA: Enzyme, enzymeB: Enzyme):
        """
        Evolutionary event of Neofunctionalisation between a pair of enzymes.
        
        The conditions for a neofunctionalisation are:
            - The enzyme's gene has been duplicated, according to a certain class of GeneDuplication.
            - The duplicated enzyme is associated with a different EC number than its duplicate.
        
        The order of the two enzymes has no meaning, it has been arbitrarily chosen to reflect the lexicographic order of their associated EC numbers. The enzyme posessing the "smallest" EC number comes first.
        This absolute ordering prevents duplicate events, because without an order there would have always been a second event with the exact same enzymes, but in swapped positions, because neofunctionalisation has no direction here and is, thus, symmetric.
        
        Parameters
        ----------
        enzymeA : Enzyme
            An enzyme, which is a gene duplicate of `enzymeB`. The order is arbitrary.
        enzymeB : Enzyme
            An enzyme, which is a gene duplicate of `enzymeA`. The order is arbitrary.
        
        Attributes
        ----------
        self.enzymePair : Tuple[Enzyme, Enzyme]
            Tuple of the two enzymes, sorted by the lexicographic order of their "smallest" EC number.
        
        Raises
        ------
        ValueError
            If the enzymes are equal, have the same set of EC numbers, or one has no EC number.
        """
        if enzymeA == enzymeB:
            raise ValueError('The enzymes must be unequal!')
        
        ecNumbersAset = enzymeA.ecNumbers
        ecNumbersBset = enzymeB.ecNumbers
        
        if ecNumbersAset == ecNumbersBset:
            raise ValueError('The enzymes must have differing EC numbers to be NEOfunctionalised!')
        
        if len(ecNumbersAset) == 0 or len(ecNumbersBset) == 0:
            raise ValueError('The enzymes have to be associated with  at least one EC number!')
        
        ecNumbersA = list(ecNumbersAset)
        ecNumbersA.sort()
        
        ecNumbersB = list(ecNumbersBset)
        ecNumbersB.sort()
        
        if len(ecNumbersA) <= len(ecNumbersB):
            smallerSet = ecNumbersA
            biggerSet = ecNumbersB
            isAsmaller = True
            
        else:
            smallerSet = ecNumbersB
            biggerSet = ecNumbersA
            isAsmaller = False
        
        for index, ec1 in enumerate(smallerSet):
            ec2 = biggerSet[index]
            
            if ec1 == ec2:
                if index == len(smallerSet) - 1: # end of smaller set reached
                    if isAsmaller:
                        self.enzymePair = (enzymeA, enzymeB)
                    else:
                        self.enzymePair = (enzymeB, enzymeA)
                
                else: # not yet at the end, compare next indizes
                    continue
            
            elif ec1 < ec2:
                if isAsmaller:
                    self.enzymePair = (enzymeA, enzymeB)
                else:
                    self.enzymePair = (enzymeB, enzymeA)
            
            else:
                if isAsmaller:
                    self.enzymePair = (enzymeB, enzymeA)
                else:
                    self.enzymePair = (enzymeA, enzymeB)
    
[docs]    def getEnzymes(self) -> Tuple[Enzyme, Enzyme]:
        """
        Get the pair of enzymes.
        
        Returns
        -------
        Tuple[Enzyme, Enzyme]
        """
        return self.enzymePair
    
[docs]    def getEcNumbers(self) -> Tuple[Set[EcNumber], Set[EcNumber]]:
        """
        Get the enzymes' EC numbers.
        
        Returns
        -------
        Tuple[Set[EcNumber], Set[EcNumber]]
            Same order as in :func:`getEnzymes`.
            Because an enzyme could have multiple EC numbers, they are given as sets.
        """
        return (self.enzymePair[0].ecNumbers, self.enzymePair[1].ecNumbers)
    
[docs]    def getDifferingEcLevels(self) -> int:
        """
        Get the maximum number of EC levels in which the enzymes' EC numbers differ.
        
        Returns
        -------
        int
            Number of differing EC levels between the two enzymes' EC numbers, starting with the substrate-level.
            If an enzyme has multiple EC numbers, returns the biggest difference.
            For example 1.2.3.4 and 1.2.3.7 returns 1, while 1.2.3.4 and 1.8.9.10 returns 3.
            However, wildcards do **not** match numbers: 1.2.3.4 and 1.2.3.- returns 1!
        """
        biggestDifference = 0
        
        for ecA in self.getEcNumbers()[0]:
            
            for ecB in self.getEcNumbers()[1]:
                
                difference = 4 - ecA.matchingLevels(ecB, wildcardMatchesNumber = False)
                
                if difference > biggestDifference:
                    biggestDifference = difference
        
        return biggestDifference
    
[docs]    def isSameEcReaction(self) -> bool:
        """
        Whether the enzymes' EC numbers describe a different reaction, or merely a different substrate
        
        Returns
        -------
        bool
            *True*, if the biggest difference in EC levels of the two enzymes is still on the fourth (substrate) level.
        """
        if self.getDifferingEcLevels() > 1:
            return False
        else:
            return True
    
[docs]    def toHtml(self, short = False):
        """
        Get the string representation as an HTML line.
        """
        enzymePair = self.enzymePair
        ecPair = self.getEcNumbers()
        return '<td>' + enzymePair[0].toHtml() + '<td>' + ',&nbsp;'.join([ec.toHtml(short) for ec in sorted(ecPair[0])]) + '</td></td><td><-></td><td>'+ enzymePair[1].toHtml() + '<td>' + ',&nbsp;'.join([ec.toHtml(short) for ec in sorted(ecPair[1])]) + '</td></td>'
    
    def __str__(self):
        enzymePair = self.enzymePair
        ecPair = self.getEcNumbers()
        return '(' + str(enzymePair[0]) + ' [' + ', '.join([str(ec) for ec in sorted(ecPair[0])]) + '],\t'+ str(enzymePair[1]) + ' [' + ', '.join([str(ec) for ec in sorted(ecPair[1])]) + '])'
    
    def __repr__(self):
        return self.__str__()
        
    def __eq__(self, other):
        if isinstance(self, other.__class__):
            return self.enzymePair == other.enzymePair
        return False
        
    def __ne__(self, other):
        return not self == other
    
    def __hash__(self):
        return self.enzymePair.__hash__()
    
    def __lt__(self, other):
        
        # sort by EC number first
        selfEnzyme1 = self.enzymePair[0]
        selfEnzyme2 = self.enzymePair[1]
        
        selfEnzyme1EcList = list(selfEnzyme1.ecNumbers)
        selfEnzyme2EcList = list(selfEnzyme2.ecNumbers)
        
        otherEnzyme1 = other.enzymePair[0]
        otherEnzyme2 = other.enzymePair[1]
        
        otherEnzyme1EcList = list(otherEnzyme1.ecNumbers)
        otherEnzyme2EcList = list(otherEnzyme2.ecNumbers)

        if selfEnzyme1EcList == otherEnzyme1EcList:
            
            if selfEnzyme2EcList == otherEnzyme2EcList:
                
                # then by gene ID
                if selfEnzyme1.uniqueID == otherEnzyme1.uniqueID:
                    return selfEnzyme2.uniqueID < otherEnzyme2.uniqueID
                
                else:
                    return selfEnzyme1.uniqueID < otherEnzyme1.uniqueID
                
            else:
                return selfEnzyme2EcList < otherEnzyme2EcList

        else:
            return selfEnzyme1EcList < otherEnzyme1EcList
    
    def __gt__(self, other):
        
        # sort by EC number first
        selfEnzyme1 = self.enzymePair[0]
        selfEnzyme2 = self.enzymePair[1]
        
        selfEnzyme1EcList = list(selfEnzyme1.ecNumbers)
        selfEnzyme2EcList = list(selfEnzyme2.ecNumbers)
        
        otherEnzyme1 = other.enzymePair[0]
        otherEnzyme2 = other.enzymePair[1]
        
        otherEnzyme1EcList = list(otherEnzyme1.ecNumbers)
        otherEnzyme2EcList = list(otherEnzyme2.ecNumbers)

        if selfEnzyme1EcList == otherEnzyme1EcList:
            
            if selfEnzyme2EcList == otherEnzyme2EcList:
                
                # then by gene ID
                if selfEnzyme1.uniqueID == otherEnzyme1.uniqueID:
                    return selfEnzyme2.uniqueID > otherEnzyme2.uniqueID
                
                else:
                    return selfEnzyme1.uniqueID > otherEnzyme1.uniqueID
                
            else:
                return selfEnzyme2EcList > otherEnzyme2EcList

        else:
            return selfEnzyme1EcList > otherEnzyme1EcList
    
    def __le__(self, other):
        
        # sort by EC number first
        selfEnzyme1 = self.enzymePair[0]
        selfEnzyme2 = self.enzymePair[1]
        
        selfEnzyme1EcList = list(selfEnzyme1.ecNumbers)
        selfEnzyme2EcList = list(selfEnzyme2.ecNumbers)
        
        otherEnzyme1 = other.enzymePair[0]
        otherEnzyme2 = other.enzymePair[1]
        
        otherEnzyme1EcList = list(otherEnzyme1.ecNumbers)
        otherEnzyme2EcList = list(otherEnzyme2.ecNumbers)

        if selfEnzyme1EcList == otherEnzyme1EcList:
            
            if selfEnzyme2EcList == otherEnzyme2EcList:
                
                # then by gene ID
                if selfEnzyme1.uniqueID == otherEnzyme1.uniqueID:
                    return selfEnzyme2.uniqueID <= otherEnzyme2.uniqueID
                
                else:
                    return selfEnzyme1.uniqueID <= otherEnzyme1.uniqueID
                
            else:
                return selfEnzyme2EcList <= otherEnzyme2EcList

        else:
            return selfEnzyme1EcList <= otherEnzyme1EcList
    
    def __ge__(self, other):
        
        # sort by EC number first
        selfEnzyme1 = self.enzymePair[0]
        selfEnzyme2 = self.enzymePair[1]
        
        selfEnzyme1EcList = list(selfEnzyme1.ecNumbers)
        selfEnzyme2EcList = list(selfEnzyme2.ecNumbers)
        
        otherEnzyme1 = other.enzymePair[0]
        otherEnzyme2 = other.enzymePair[1]
        
        otherEnzyme1EcList = list(otherEnzyme1.ecNumbers)
        otherEnzyme2EcList = list(otherEnzyme2.ecNumbers)

        if selfEnzyme1EcList == otherEnzyme1EcList:
            
            if selfEnzyme2EcList == otherEnzyme2EcList:
                
                # then by gene ID
                if selfEnzyme1.uniqueID == otherEnzyme1.uniqueID:
                    return selfEnzyme2.uniqueID >= otherEnzyme2.uniqueID
                
                else:
                    return selfEnzyme1.uniqueID >= otherEnzyme1.uniqueID
                
            else:
                return selfEnzyme2EcList >= otherEnzyme2EcList

        else:
            return selfEnzyme1EcList >= otherEnzyme1EcList




[docs]class NeofunctionalisedEnzymes():
    
    def __init__(self, enzymes: Set[Enzyme], geneDuplicationModel: GeneDuplication, eValue = defaultEValue, ignoreDuplicatesOutsideSet: bool = True):
        """
        Neofunctionalisation events among certain `enzymes`.
        
        Parameters
        ----------
        enzymes : Set[Enzyme]
            Enzymes among which to test for neofunctionalisation. Neofunctionalisations involving enzymes outside this set are **not** reported.
        geneDuplicationModel : GeneDuplication
            The model of gene duplication to use.
        eValue : float, optional
            Threshold for the statistical expectation value (E-value), below which a sequence alignment is considered significant.
        ignoreDuplicatesOutsideSet : bool, optional
            If *True*, any neofunctionalisation involving an enzyme outside the `enzymes` set is not reported.
            This helps to exclude secondary metabolism when examining core metabolism.
        
        Raises
        ------
        ValueError
            If a gene duplication model is used which requires instantiation, but only its class was given.
        """
        if geneDuplicationModel == ChevronGeneDuplication:
            raise ValueError("Chevron gene duplication model requires you to instantiate an object, parametrised with the set of possibly orthologous organisms.")
        
        elif geneDuplicationModel == SimpleGroupGeneDuplication:
            raise ValueError("Simple group gene duplication model requires you to instantiate an object, parametrised with the set of organisms belonging to the same group.")
        
        self._geneDuplicationModel = geneDuplicationModel
        
        self._neofunctionalisations = set()
        
        # get possibly gene-duplicated enzymes, pointing to their duplicates
        geneIdToEnzyme = dict()
        for enzyme in enzymes:
            geneIdToEnzyme[enzyme.geneID] = enzyme
        
        if ignoreDuplicatesOutsideSet is True: # restrict allowed gene duplicates to the set of passed enzymes. This helps to avoid parts of the metabolism outside the core metabolism
            
            if isinstance(geneDuplicationModel, ChevronGeneDuplication):
                duplicatedEnzymePairs = geneDuplicationModel.getEnzymePairs(enzymes, eValue, ignoreDuplicatesOutsideSelf = ignoreDuplicatesOutsideSet, geneIdToEnzyme = geneIdToEnzyme)
            else:
                duplicatedEnzymePairs = geneDuplicationModel.getEnzymePairs(enzymes, eValue, ignoreDuplicatesOutsideSet = set(geneIdToEnzyme.keys()), geneIdToEnzyme = geneIdToEnzyme)
        
        else: # you want to check for neofunctionalisation outside the passed enzymes (usually core metabolism)
            duplicatedEnzymePairs = geneDuplicationModel.getEnzymePairs(enzymes, eValue)
        
        # in all pairs of duplicated enzymes, find neofunctionalised ones
        for enzymePair in duplicatedEnzymePairs:
            try:
                neofunctionalisation = Neofunctionalisation(enzymePair[0], enzymePair[1])
                self._neofunctionalisations.add( neofunctionalisation )
            
            except ValueError: # obviously not neofunctionalised
                pass # ignore
    
    
[docs]    def getNeofunctionalisations(self, minimumEcDifference: int = None) -> Set[Neofunctionalisation]:
        """
        Get neofunctionalisation events between two enzymes each.
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        
        Returns
        -------
        Set[Neofunctionalisation]
            Set of possible neofunctionalisation events.
        """
        if minimumEcDifference is not None and minimumEcDifference > 1:
            filteredNeofunctionalisations = set()
            
            for neofunctionalisation in self._neofunctionalisations:
                if neofunctionalisation.getDifferingEcLevels() >= minimumEcDifference:
                    filteredNeofunctionalisations.add( neofunctionalisation )
            
            return filteredNeofunctionalisations
            
        else:
            return self._neofunctionalisations.copy()            
        
[docs]    def getEnzymes(self, minimumEcDifference: int = None) -> Set[Enzyme]:
        """
        Get all neofunctionalised enzymes.
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        
        Returns
        -------
        Set[Enzyme]
            Set of all possibly neofunctionalised enzymes, regardless of the real direction of neofunctionalisation, which we can not determine here.
        """
        neofunctionalisedEnzymes = set()
        
        for neofunctionalisation in self.getNeofunctionalisations(minimumEcDifference):
            
            neofunctionalisedEnzymes.update( neofunctionalisation.getEnzymes() )
        
        return neofunctionalisedEnzymes
    
[docs]    def filterGraph(self, enzymeGraph: SubstanceEnzymeGraph, minimumEcDifference: int = None) -> SubstanceEnzymeGraph:
        """
        Filter enzyme graph to only contain neofunctionalised enzymes.
        
        Parameters
        ----------
        enzymeGraph : SubstanceEnzymeGraph
            The enzyme graph to filter.
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        
        Returns
        -------
        SubstanceEnzymeGraph
            A copy of `enzymeGraph`, leaving only edges with a neofunctionalised enzyme as key.
        """
        graph = enzymeGraph.copy()
        neofunctionalisedEnzymes = self.getEnzymes(minimumEcDifference)
        graph.removeAllEnzymesExcept( neofunctionalisedEnzymes )
        return graph
    
[docs]    def colourGraph(self, enzymeGraph: SubstanceEnzymeGraph, colour: Export.Colour = Export.Colour.GREEN, minimumEcDifference: int = None) -> SubstanceEnzymeGraph:
        """
        Colour enzyme graph's neofunctionalised enzyme edges.
        
        Parameters
        ----------
        enzymeGraph : SubstanceEnzymeGraph
            The enzyme graph to colour.
        colour : Export.Colour, optional
            The colour to use for edges with neofunctionalised enzymes as key.
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        
        Returns
        -------
        SubstanceEnzymeGraph
            A copy of `enzymeGraph` in which edges with neofunctionalised enzymes as key have an additional colour attribute, see :func:`FEV_KEGG.Drawing.Export.addColourAttribute`.
        """
        graph = enzymeGraph.copy()
        neofunctionalisedEnzymes = self.getEnzymes(minimumEcDifference)
        Export.addColourAttribute(graph, colour, nodes = False, edges = neofunctionalisedEnzymes)
        return graph




[docs]class FunctionChange():
    
    def __init__(self, ecA: EcNumber, ecB: EcNumber):
        """
        Possible evolutionary change of enzymatic function, from one EC number to another.
        
        The direction of change, or if it really happened, can not be determined here!
        The order of EC numbers in this object is arbitrarily chosen to reflect their lexicographic order.
        
        A function change resembles the possibility that the first EC number has evolutionarily changed into the second one. Or the other way around, since the direction of evolution can not be determined here.
        A function change can never have the same EC number twice, nor can the first be lexicographically "bigger" than the second, see the examples section.
        
        Parameters
        ----------
        ecA: EcNumber
        ecB: EcNumber
            Must be lexicographically "bigger" than `ecA`. Can not be equal to `ecA`.
        
        Raises
        ------
        ValueError
            If `ecA` is lexicographically "bigger" than, or equal to, `ecB`.
        """
        if ecA >= ecB:
            raise ValueError("EC number 1 is bigger than or equal to EC number 2.")
        
        self.ecA = ecA
        self.ecB = ecB
        self.ecPair = (ecA, ecB)
    
[docs]    @classmethod
    def fromNeofunctionalisation(cls, neofunctionalisation: Neofunctionalisation) -> Set['FunctionChange']:
        """
        Create combinations of function changes from a `neofunctionalisation`.
        
        Parameters
        ----------
        neofunctionalisation : Neofunctionalisation
        
        Returns
        -------
        Set[FunctionChange]
            Set of function changes which might have been caused by the `neofunctionalisation`.
            
            Since an enzyme of a neofunctionalisation can have multiple EC numbers, all combinations of the two enzymes' EC numbers are formed and treated as separate possible function changes.
        
        Examples
        --------
        A: 1
        B: 2
        = (1, 2)
        
        A: 1
        B: 1, 2
        = (1, 2)
        
        A: 1, 2
        B: 1, 3
        = (1, 3) (1, 2) (2, 3)
        
        A: 1, 2
        B: 3, 4
        = (1, 3) (1, 4) (2, 3) (2, 4)
        
        A: 1, 2
        B: 1, 2, 3
        = (1, 3) (2, 3)
        """
        ecNumbersA, ecNumbersB = neofunctionalisation.getEcNumbers()
            
        # cross product
        questionableEcPairs = itertools.product(ecNumbersA, ecNumbersB)
        
        # filter illegal products
        functionChanges = set()
        for pair in questionableEcPairs:
            try:
                functionChanges.add(cls(pair[0], pair[1]))
            except ValueError:
                pass
        
        return functionChanges
    
[docs]    def getDifferingEcLevels(self) -> int:
        """
        Get the number of EC levels in which the EC numbers differ.
        
        Returns
        -------
        int
            Number of differing EC levels between the two EC numbers, starting with the substrate-level.
            For example 1.2.3.4 and 1.2.3.7 returns 1, while 1.2.3.4 and 1.8.9.10 returns 3.
            However, wildcards do **not** match numbers: 1.2.3.4 and 1.2.3.- returns 1!
        """
        return 4 - self.ecA.matchingLevels(self.ecB, wildcardMatchesNumber = False)
    
[docs]    def toHtml(self, short = False):
        """
        Get the string representation as an HTML line.
        """
        return '<td>' + self.ecPair[0].toHtml(short) + '</td><td><-></td><td>' + self.ecPair[1].toHtml(short) + '</td>'
    
    def __str__(self):
        return self.ecPair.__str__()
    
    def __repr__(self):
        return self.__str__()
        
    def __eq__(self, other):
        if isinstance(self, other.__class__):
            return self.ecPair == other.ecPair
        return False
        
    def __ne__(self, other):
        return not self == other
    
    def __hash__(self):
        return self.ecPair.__hash__()
    
    def __lt__(self, other):
        return self.ecPair < other.ecPair
    
    def __gt__(self, other):
        return self.ecPair > other.ecPair
    
    def __le__(self, other):
        return self.ecPair <= other.ecPair
    
    def __ge__(self, other):
        return self.ecPair >= other.ecPair




[docs]class NeofunctionalisedECs():
    
    def __init__(self, neofunctionalisedEnzymes: NeofunctionalisedEnzymes):
        """
        EC numbers which are affected by neofunctionalisation events.
        
        Parameters
        ----------
        neofunctionalisedEnzymes : NeofunctionalisedEnzymes
            Neofunctionalisation events among certain enzymes.
        """
        self._neofunctionalisedEnzymes = neofunctionalisedEnzymes
    
    
[docs]    def getNeofunctionalisationsForFunctionChange(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Dict[FunctionChange, Set[Neofunctionalisation]]:
        """
        Get neofunctionalsation events, keyed by a change of function between the two enzymes.        
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        minimumOrganismsCount : int, optional
            Minimum number of organisms which have to be involved in the neofunctionalisations of each function change.
            If *None*, there is no filtering due to organism involvement.
            For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc), finally, if `minimumOrganismsCount` <= 3, the function change 1->2 is returned.
        
        Returns
        -------
        Dict[FunctionChange, Set[Neofunctionalisation]]
            Dictionary of function changes, pointing to a set of neofunctionalisations which might have caused them.
            
            Since an enzyme of a neofunctionalisation can have multiple EC numbers, all combinations of the two enzymes' EC numbers are formed and treated as separate possible function changes.
            The neofunctionalisation is then saved again for each function change, which obviously leads to duplicated neofunctionalisation objects.
        
        Examples
        --------
        A: 1
        B: 2
        = (1, 2)
        
        A: 1
        B: 1, 2
        = (1, 2)
        
        A: 1, 2
        B: 1, 3
        = (1, 3) (1, 2) (2, 3)
        
        A: 1, 2
        B: 3, 4
        = (1, 3) (1, 4) (2, 3) (2, 4)
        
        A: 1, 2
        B: 1, 2, 3
        = (1, 3) (2, 3)
        """
        neofunctionalisationForFunctionChange = dict()
        
        # get neofunctionalisations
        for neofunctionalisation in self._neofunctionalisedEnzymes.getNeofunctionalisations(minimumEcDifference):
            
            # split sets of EC numbers into pair-wise combinations
            functionChanges = FunctionChange.fromNeofunctionalisation(neofunctionalisation)
            
            for functionChange in functionChanges:
                currentSet = neofunctionalisationForFunctionChange.get(functionChange, None)
                
                if currentSet is None:
                    currentSet = set()
                    neofunctionalisationForFunctionChange[functionChange] = currentSet
                
                currentSet.add(neofunctionalisation)
        
        # filter function changes with neofunctionalised enzymes which stem from too few organisms
        if minimumOrganismsCount is not None:
            keysToDelete = []
            
            for functionChange, neofunctionalisations in neofunctionalisationForFunctionChange.items():
                
                enzymes = set()
                
                for neofunctionalisation in neofunctionalisations:    
                    enzymes.update( neofunctionalisation.getEnzymes() )
                
                organisms = set()
                
                for enzyme in enzymes:
                    organisms.add( enzyme.organismAbbreviation )
                
                # enough occuring organisms?
                if not len(organisms) >= minimumOrganismsCount:
                    keysToDelete.append( functionChange )
            
            # delete keys which failed the test
            for key in keysToDelete:
                del neofunctionalisationForFunctionChange[key]
        
        return neofunctionalisationForFunctionChange
    
[docs]    def getFunctionChanges(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Set[FunctionChange]:
        """
        Get all possible changes of function between the two enzymes of every neofunctionalisation.
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        minimumOrganismsCount : int, optional
            Minimum number of organisms which have to be involved in the neofunctionalisations of each function change.
            If *None*, there is no filtering due to organism involvement.
        
        Returns
        -------
        Set[FunctionChange]
            Set of all function changes, which meet the criteria.
        """
        return set( self.getNeofunctionalisationsForFunctionChange(minimumEcDifference, minimumOrganismsCount).keys() )
    
[docs]    def getEnzymesForFunctionChange(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Dict[FunctionChange, Set[Enzyme]]:
        """
        Get enzymes of neofunctionalisations, keyed by a possible change of function.
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        minimumOrganismsCount : int, optional
            Minimum number of organisms which have to be involved in the neofunctionalisations of each function change.
            If *None*, there is no filtering due to organism involvement.
            For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc), finally, if `minimumOrganismsCount` <= 3, the function change 1->2 is returned.
        
        Returns
        -------
        Dict[FunctionChange, Set[Enzyme]]
            Dictionary of function changes, pointing to a set of enzymes involved in the neofunctionalisations which might have caused the function change.
            This can lead to many duplicated enzymes.
        """
        enzymesForFunctionChange = dict()
        
        for functionChange, neofunctionalisations in self.getNeofunctionalisationsForFunctionChange(minimumEcDifference, minimumOrganismsCount).items():
            currentSet = enzymesForFunctionChange.get(functionChange, None)
                
            if currentSet is None:
                currentSet = set()
                enzymesForFunctionChange[functionChange] = currentSet
            
            for neofunctionalisation in neofunctionalisations:
                currentSet.update( neofunctionalisation.getEnzymes() )
            
        return enzymesForFunctionChange
    
    
    
[docs]    def getNeofunctionalisationsForEC(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Dict[EcNumber, Set[Neofunctionalisation]]:
        """
        Get neofunctionalisation events, keyed by an EC number participating in the change of function between the two enzymes.
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        minimumOrganismsCount : int, optional
            Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number.
            If *None*, there is no filtering due to organism involvement.
            This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change.
            For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc).
            Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported.
            However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3.
        
        Returns
        -------
        Dict[EcNumber, Set[Neofunctionalisation]]
            Dictionary of EC numbers which are part of function changes, pointing to a set of neofunctionalisations which might have caused them.
            Very likely has duplicated neofunctionalisations, because there are always at least two EC numbers involved in a neofunctionalisation.
        """
        neofunctionalisationForEC = dict()
        
        # get neofunctionalisations
        for neofunctionalisation in self._neofunctionalisedEnzymes.getNeofunctionalisations(minimumEcDifference):
            
            # split sets of EC numbers into pair-wise combinations
            functionChanges = FunctionChange.fromNeofunctionalisation(neofunctionalisation)
            
            for functionChange in functionChanges:
                
                # for each EC number of a function change, save neofunctionalisation
                for ec in functionChange.ecPair:
                    currentSet = neofunctionalisationForEC.get(ec, None)
                    
                    if currentSet is None:
                        currentSet = set()
                        neofunctionalisationForEC[ec] = currentSet
                    
                    currentSet.add(neofunctionalisation)
        
        # filter ECs with neofunctionalised enzymes which stem from too few organisms
        if minimumOrganismsCount is not None:
            keysToDelete = []
            
            for ec, neofunctionalisations in neofunctionalisationForEC.items():
                
                enzymes = set()
                
                for neofunctionalisation in neofunctionalisations:    
                    enzymes.update( neofunctionalisation.getEnzymes() )
                
                organisms = set()
                
                for enzyme in enzymes:
                    organisms.add( enzyme.organismAbbreviation )
                
                # enough occuring organisms?
                if not len(organisms) >= minimumOrganismsCount:
                    keysToDelete.append( ec )
            
            # delete keys which failed the test
            for key in keysToDelete:
                del neofunctionalisationForEC[key]
        
        return neofunctionalisationForEC
    
[docs]    def getECs(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Set[EcNumber]:
        """
        Get EC numbers participating in the change of function due to neofunctionalisations.
        
        They could also be called "neofunctionalised" EC numbers.
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        minimumOrganismsCount : int, optional
            Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number.
            If *None*, there is no filtering due to organism involvement.
            This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change.
            For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc).
            Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported.
            However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3.
        
        Returns
        -------
        Set[EcNumber]
            Set of EC numbers which are part of function changes which possibly happened due to neofunctionalisations.
        """
        return set( self.getNeofunctionalisationsForEC(minimumEcDifference, minimumOrganismsCount).keys() )
    
[docs]    def getEnzymesForEC(self, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> Dict[EcNumber, Set[Enzyme]]:
        """
        Get enzymes of neofunctionalisations, keyed by an EC number of a possible function change.
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        minimumOrganismsCount : int, optional
            Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number.
            If *None*, there is no filtering due to organism involvement.
            This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change.
            For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc).
            Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported.
            However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3.
        
        Returns
        -------
        Dict[EcNumber, Set[Enzyme]]
            Dictionary of EC numbers, pointing to a set of enzymes involved in the neofunctionalisations which might have caused the function changes the EC number is part of.
            This can lead to many duplicated enzymes.
        """
        enzymesForEC = dict()
        
        for ec, neofunctionalisations in self.getNeofunctionalisationsForEC(minimumEcDifference, minimumOrganismsCount).items():
            currentSet = enzymesForEC.get(ec, None)
                
            if currentSet is None:
                currentSet = set()
                enzymesForEC[ec] = currentSet
            
            for neofunctionalisation in neofunctionalisations:
                currentSet.update( neofunctionalisation.getEnzymes() )
            
        return enzymesForEC
    
    
    
[docs]    def filterGraph(self, ecGraph: SubstanceEcGraph, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> SubstanceEcGraph:
        """
        Filter EC graph to only contain "neofunctionalised" EC numbers.
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        minimumOrganismsCount : int, optional
            Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number.
            If *None*, there is no filtering due to organism involvement.
            This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change.
            For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc).
            Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported.
            However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3.
        
        Returns
        -------
        SubstanceEcGraph
            A copy of `ecGraph`, leaving only edges with a "neofunctionalised" EC as key.
        """
        graph = ecGraph.copy()
        neofunctionalisedECs = self.getECs(minimumEcDifference, minimumOrganismsCount)
        graph.removeAllECsExcept( neofunctionalisedECs )
        return graph
    
[docs]    def colourGraph(self, ecGraph: SubstanceEcGraph, colour: Export.Colour = Export.Colour.GREEN, minimumEcDifference: int = None, minimumOrganismsCount: int = None) -> SubstanceEcGraph:
        """
        Colour EC graph's "neofunctionalised" EC number edges.
        
        Parameters
        ----------
        minimumEcDifference : int, optional
            May only be one of [1, 2, 3, 4].
            If *None* or *1*, all neofunctionalisations are returned.
            If > *1*, return only neofunctionalisations in which the EC numbers differ in more than the `minimumEcDifference` lowest levels.
            They then describe a different reaction, instead of only a different substrate.
            For example, `minimumEcDifference` == *2* means that 1.2.3.4/1.2.3.5 is not reported, while 1.2.3.4/1.2.5.6 is.
        minimumOrganismsCount : int, optional
            Minimum number of organisms which have to be involved in the neofunctionalisations of each EC number.
            If *None*, there is no filtering due to organism involvement.
            This sums the occurences of organisms across function changes, for each EC number the function changes overlap with. Hence, it is much less likely that a neofunctionalisation is filtered, compared to filtering per function change.
            For example, the function change 1->2 is associated with two neofunctionalisations 'eco:12345'->'eco:69875' and 'obc:76535'->'abc:41356', this involves three organisms in total (eco, obc, abc).
            Also, the function change 1->3 involves two organisms ('eco:53235'->'iuf:34587'). If `minimumOrganismsCount` == 4, neither 1->2, nor 1->3 are reported.
            However, if we look at single EC numbers, 1 is involved in function changes affecting four organisms (eco, obc, abc, iuf). Thus, 1 would be reported here, but neither 2 nor 3.
        
        Returns
        -------
        SubstanceEcGraph
            A copy of `ecGraph` in which edges with "neofunctionalised" ECs as key have an additional colour attribute, see :func:`FEV_KEGG.Drawing.Export.addColourAttribute`.
        """
        graph = ecGraph.copy()
        neofunctionalisedECs = self.getECs(minimumEcDifference, minimumOrganismsCount)
        Export.addColourAttribute(graph, colour, nodes = False, edges = neofunctionalisedECs)
        return graph