# Source code for FEV_KEGG.Graph.Elements

from builtins import str
import re
from typing import List, Iterable

from FEV_KEGG.KEGG.DataTypes import Gene
from FEV_KEGG.Util import Util
from FEV_KEGG import settings


class Element(object):
    def __init__(self, uniqueID: str):
        """
        Generic graph element with a `uniqueID`.

        Comparable (==, !=, <, >, <=, >=) and hashable by this unique ID. Converting to a string returns the `uniqueID`
        (followed by the KEGG URL in parentheses, if `settings.printElementUrl` is set).

        Parameters
        ----------
        uniqueID : str
            String uniquely identifying this element among all other possible elements.

        Attributes
        ----------
        self.uniqueID : str
            Unique element ID.
        """
        self.uniqueID = uniqueID

    def getUrl(self):
        """
        Get the link to KEGG for this element.

        Returns
        -------
        str
            URL to KEGG, built by appending the `uniqueID` to KEGG's dbget address.
        """
        return "http://kegg.jp/dbget-bin/www_bget?" + self.uniqueID

    def getRestUrl(self):
        """
        Get the link to KEGG's REST-API for this element.

        Essentially the same as :func:`getUrl`, but meant to be read by machines, therefore no eye-candy.

        Returns
        -------
        str
            URL to KEGG's REST-API
        """
        return "http://rest.kegg.jp/get/" + self.uniqueID

    def toHtml(self, short = False, noTd = False):
        """
        Get the Element's string representation surrounded by its URL as an HTML line.

        Parameters
        ----------
        short : bool, optional
            If *True*, the element's `name` is left out, even if there is one.
        noTd : bool, optional
            If *True*, the result is not wrapped in table cells (``<td>``).

        Returns
        -------
        str
            HTML link to KEGG showing this element's string representation. If the element has a `name` and `short` is not *True*, the name follows in parentheses.
            Unless `noTd` is *True*, link and name are put into two table cells, the second one being empty if there is no name to show.
        """
        link = "<a target='_blank' href='" + self.getUrl() + "'>" + str(self) + "</a>"

        # Not every element type has a `name` attribute, treat a missing one like an unknown name instead of failing.
        name = getattr(self, 'name', None)

        if name is None or short is True:
            if noTd is True:
                return link
            return "<td>" + link + "</td><td></td>"

        if noTd is True:
            return link + "&nbsp;(" + name + ")"
        return "<td>" + link + "</td><td>(" + name + ")</td>"

    def __str__(self):
        if settings.printElementUrl:
            return str(self.uniqueID) + ' (' + self.getUrl() + ')'
        return self.uniqueID

    def __repr__(self):
        return self.__str__()

    def __eq__(self, other):
        # `other` has to be an Element itself, or it would not have a `uniqueID` to compare with.
        # Additionally, this element has to be of the same class as `other`, or of a sub-class of it.
        if isinstance(other, Element) and isinstance(self, other.__class__):
            return self.uniqueID == other.uniqueID
        return False

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        return hash(self.uniqueID)

    def __lt__(self, other):
        return self.uniqueID < other.uniqueID

    def __gt__(self, other):
        return self.uniqueID > other.uniqueID

    def __le__(self, other):
        return self.uniqueID <= other.uniqueID

    def __ge__(self, other):
        return self.uniqueID >= other.uniqueID
    


class DrugIdError(Exception):
    """
    Signals that a drug ID was passed where a :class:`SubstanceID` was to be created.

    Only compounds and glycans are useful in our model of metabolism, drugs are not.
    """

class SubstanceID(Element):
    # 'C' (compound) or 'G' (glycan), followed by exactly five digits.
    # The character class is necessary, because in '^C|G[0-9]{5}$' the alternation would split the whole pattern, accepting anything starting with 'C'.
    REGEX_PATTERN = re.compile(r'^[CG][0-9]{5}$')

    def __init__(self, keggSubstanceID: 'C01102'):
        """
        Represents a substrate/product of metabolism by compound/glycan ID from KEGG, eg. 'C01102' or 'G00160'.

        Parameters
        ----------
        keggSubstanceID : str
            Unique ID of the compound or glycan.

        Attributes
        ----------
        self.keggCompoundID : str
            Unique compound/glycan ID.
        self.description : str
            Descriptive chemical name of the compound/glycan. May likely be *None*. Usually a list of synonymous names.
        self.name : str
            Short chemical name of the compound/glycan. May likely be *None*. Is the shortest name occuring in `description`.

        Raises
        ------
        DrugIdError
            Drug IDs, eg. D08603, raise a DrugIdError, because only compounds and glycans are useful in our model of metabolism. Use the synonymous Compound ID instead.
        ValueError
            If `keggSubstanceID` is not formatted like a compound or glycan ID.

        Note
        ----
        This does not check if the compound/glycan actually exists in KEGG! You will find out eventually when trying to retrieve information about it.

        See Also
        --------
        FEV_KEGG.Graph.SubstanceGraphs.SubstanceGraph.addSubstanceDescriptions : The function to download and add `self.description`, and `self.name`.
        """
        # startswith() instead of [0], so an empty string reaches the format check below, instead of raising an IndexError
        if keggSubstanceID.startswith('D'):
            raise DrugIdError('Drug IDs are not accepted, as there are usually accompanied by a synonymous Compound ID.')

        if self.__class__.REGEX_PATTERN.match(keggSubstanceID) is None: # wrong format
            raise ValueError('Compound/Glycan ID not formatted correctly: ' + keggSubstanceID)

        Element.__init__(self, keggSubstanceID)
        self.keggCompoundID = self.uniqueID
        # both are filled in later, if at all
        self.description = None
        self.name = None




class ReactionID(Element):

    def __init__(self, keggReactionID: 'R01899'):
        """
        A reaction of metabolism, identified by its reaction ID from KEGG, eg. 'R01899'.

        Parameters
        ----------
        keggReactionID : str
            Unique ID of the reaction. Becomes the `uniqueID` of this element.

        Attributes
        ----------
        self.keggReactionID : str
            Unique reaction ID, same as `self.uniqueID`.

        Note
        ----
        Neither the format of the ID is checked, nor whether the reaction actually exists in KEGG! You will find out eventually when trying to retrieve information about it.
        """
        super().__init__(keggReactionID)
        self.keggReactionID = self.uniqueID



class Enzyme(Element):

    def __init__(self, organismAbbreviation: 'eco', geneName: 'b0004', ecNumberStrings: List[str], name: 'thrC' = None, description: '(RefSeq) hydrogenase 4, subunit' = None):
        """
        Represents an enzyme of metabolism.

        It has exactly one GeneID, which is its unique identifier.

        Enzymes are equal if their gene IDs are equal. However, they are ordered by their EC numbers first, and by their gene ID second.

        Parameters
        ----------
        organismAbbreviation : str
            Abbreviation string of the organism this enzyme belongs to, as known to KEGG, e.g. 'eco'. Must obviously be unique and existant in KEGG.
        geneName : str
            Name of the gene which represents this enzyme, e.g. 'b0004'. Will be combined with `organismAbbreviation` to form the unique :class:`GeneID`. Thus, must be unique within the organism.
        ecNumberStrings : List[str]
            List of strings representing the EC numbers associated with this enzyme. Each one is parsed into an :class:`EcNumber` object.
        name : str, optional
            Colloquial name of this enzyme, e.g. 'thrC'. This is not used for automatic identification, you may make it *None*.
        description : str, optional
            Full description of this enzyme from KEGG, e.g. '(RefSeq) hydrogenase 4, subunit'. This is not used for automatic identification, you may make it *None*.

        Attributes
        ----------
        self.organismAbbreviation : str
        self.geneName : str
        self.geneID : GeneID
        self.name : str
            *None*, if `name` is the same as `geneName`.
        self.ecNumbers : Set[EcNumber]
        self.description : str
            The substring '(RefSeq) ' is removed.

        Raises
        ------
        ValueError
            If `organismAbbreviation` and `geneName` do not form a valid gene ID. Or if any of the EC numbers in `ecNumberStrings` is not a valid EC number.

        Note
        ----
        This does not check if the organism, gene ID, EC numbers, or anything else actually exist in KEGG! You will find out eventually when trying to retrieve information about them.
        """
        # build sub-elements, both validate their input
        geneID = GeneID(organismAbbreviation + ':' + geneName)
        ecNumbers = {EcNumber(ecNumberString) for ecNumberString in ecNumberStrings}

        # determine unique ID
        # Must be the raw ID, because the string representation of an element has the URL appended, if `settings.printElementUrl` is set.
        Element.__init__(self, geneID.uniqueID)

        # save object attributes
        self.organismAbbreviation = organismAbbreviation
        self.geneID = geneID
        self.geneName = geneName
        # a name merely repeating the gene name carries no information
        self.name = None if name == geneName else name
        self.ecNumbers = ecNumbers
        # replace useless substrings
        if description is not None:
            description = description.replace('(RefSeq) ', '')
        self.description = description

    def getEcNumbersString(self):
        """
        EC numbers associated with this enzyme as a string.

        Returns
        -------
        str
            EC numbers associated with this enzyme in a string, in ascending order, eg. '1.2.3.4, 2.3.4.5'
        """
        # sorted, because the iteration order of a set is arbitrary
        return ', '.join(str(ecNumber) for ecNumber in sorted(self.ecNumbers))

    @classmethod
    def fromGene(cls, gene: Gene) -> 'Enzyme':
        """
        Creates an :class:`Enzyme` from a :class:`FEV_KEGG.KEGG.DataTypes.Gene`.

        Parameters
        ----------
        gene : Gene
            Gene object, retrieved and parsed from KEGG GENE at some point.

        Returns
        -------
        Enzyme
            An enzyme object.

        Raises
        ------
        ValueError
            If `organismAbbreviation` and `geneName` do not form a valid gene ID. Or if any of the EC numbers in `ecNumberStrings` is not a valid EC number.
        """
        return cls(organismAbbreviation = gene.organismAbbreviation, geneName = gene.number, ecNumberStrings = gene.ecNumbers, name = gene.symbol, description = gene.name)

    def _sortKey(self):
        """
        Key by which enzymes are ordered: EC numbers first, gene ID second.

        The EC numbers have to be sorted, because the iteration order of a set is arbitrary. Comparing unsorted lists would yield different results for the very same sets.
        """
        return (sorted(self.ecNumbers), self.uniqueID)

    def __lt__(self, other):
        return self._sortKey() < other._sortKey()

    def __gt__(self, other):
        return self._sortKey() > other._sortKey()

    def __le__(self, other):
        return self._sortKey() <= other._sortKey()

    def __ge__(self, other):
        return self._sortKey() >= other._sortKey()
    

class EnzymeComplete(Enzyme):

    def __init__(self, gene: Gene):
        """
        Represents an enzyme of metabolism, saving the original underlying gene description `gene` for later manual use.

        The underlying gene description is usually not necessary, use the parent class to save memory space.

        Parameters
        ----------
        gene : Gene
            Gene object, retrieved and parsed from KEGG GENE at some point. Will be kept in memory in the *gene* attribute.
            Its `organismAbbreviation`, `number`, `ecNumbers`, `symbol`, and `name` are passed to the parent class, the same way :func:`Enzyme.fromGene` does.

        Attributes
        ----------
        self.gene : :class:`FEV_KEGG.KEGG.DataTypes.Gene`
            Original underlying gene description.

        Raises
        ------
        ValueError
            See parent class.
        """
        # Keyword arguments, because the parent expects the EC numbers *before* the name. Passing them positionally in the wrong order mixes them up.
        super().__init__(organismAbbreviation = gene.organismAbbreviation, geneName = gene.number, ecNumberStrings = gene.ecNumbers, name = gene.symbol, description = gene.name)
        self.gene = gene
    
    
class EcNumber(Element):
    WILDCARD = '-'
    # Four levels separated by dots. The first level is 1-7, the second and third are 1-99, the fourth is 1-999.
    # All but the first level may be a wildcard. The look-behinds forbid a number following a wildcard level.
    REGEX_PATTERN = re.compile(r'^[1-7]\.(([1-9][0-9]{0,1})|\-)\.(((?<!\-\.)([1-9][0-9]{0,1}))|\-)\.(((?<!\-\.)([1-9][0-9]{0,2}))|\-)$')

    def __init__(self, ecNumberString: '4.2.3.1'):
        """
        Represents an enzyme of metabolism by EC number, e.g. '4.2.3.1'.

        EC numbers are ordered numerically level by level, a wildcard counting as -1.

        Parameters
        ----------
        ecNumberString : str
            EC number represented as a string. Will be checked for correct formatting!


        Attributes
        ----------
        self.ecNumberString : str
            E.g. '4.2.3.-'.
        self.ecNumberLevels : List[str]
            E.g. ['4', '2', '3', '-'].
        self.ecNumberLevelsInteger : List[int]
            E.g. [4, 2, 3, -1]. A wildcard is translated to -1.
        self.description : str
            Descriptive name of the enzymes behind this EC number. May likely be *None*. Usually a list of synonymous names.
        self.name : str
            Short name of the enzymes behind this EC number. May likely be *None*. Is the shortest name occuring in `description`.
        self.reaction : str
            IUBMB string describing the reaction formula. May likely be *None*.

        Raises
        ------
        ValueError
            If EC number is not formatted correctly.

        See Also
        --------
        FEV_KEGG.Graph.SubstanceGraphs.SubstanceEcGraph.addEcDescriptions : The function to download and add `self.description`, `self.name`, and `self.reaction`.
        """
        if self.__class__.REGEX_PATTERN.match(ecNumberString) is None: # wrong format
            raise ValueError('EC number not formatted correctly: ' + ecNumberString)

        # determine unique ID
        Element.__init__(self, ecNumberString)

        # save object attributes
        self.ecNumberString = self.uniqueID
        self.ecNumberLevels = self.ecNumberString.split('.')
        self._ecNumberLevelsInteger = self._levelsAsIntegers()
        # these are filled in later, if at all
        self.description = None
        self.name = None
        self.reaction = None

    @classmethod
    def fromArray(cls, ecNumberLevels: Iterable) -> 'EcNumber':
        """
        Creates EcNumber object from single EC number levels.

        Parameters
        ----------
        ecNumberLevels : Iterable
            Iterable of the EC number levels, can be int or str. For a wildcard, obviously only str is reasonable.

        Returns
        -------
        EcNumber
            The EC number made up of the levels, joined by dots.

        Raises
        ------
        ValueError
            If the resulting EC number is not formatted correctly.
        """
        # convert each level explicitly, because join() only accepts strings
        return cls('.'.join(str(level) for level in ecNumberLevels))

    def _levelsAsIntegers(self) -> List[int]:
        """
        Translate `self.ecNumberLevels` into integers, a wildcard becoming -1.
        """
        return [-1 if level == EcNumber.WILDCARD else int(level) for level in self.ecNumberLevels]

    @property
    def ecNumberLevelsInteger(self) -> List[int]:
        # Calculated on demand, if the pre-calculated list is missing.
        # NOTE(review): presumably for objects which were created without running the current __init__ -- confirm.
        if not hasattr(self, '_ecNumberLevelsInteger'):
            self._ecNumberLevelsInteger = self._levelsAsIntegers()
        return self._ecNumberLevelsInteger

    def contains(self, ecNumber: 'EcNumber') -> bool:
        """
        Check whether this EC number is a superset of `ecNumber`, made possibly by the wildcard.

        Parameters
        ----------
        ecNumber : EcNumber
            The EC number to compare against.

        Returns
        -------
        bool
            *True*, if the other EC number is part of the set of EC numbers defined by wildcard dashes in the levels of this EC number.
            For example 1.2.3.- contains 1.2.3.1 up to 1.2.3.999, but 1.2.3.4 can only contain itself.
        """
        for selfLevel, otherLevel in zip(self.ecNumberLevels, ecNumber.ecNumberLevels):
            if selfLevel != EcNumber.WILDCARD and selfLevel != otherLevel: # current level does not match AND has no wildcard '-' in this EC number
                return False

        return True

    def matchingLevels(self, ecNumber: 'EcNumber', wildcardMatchesNumber = True) -> int:
        """
        Determines the number of levels which match between this EC number and `ecNumber`.

        This could act as a coarse distance measure for EC numbers.

        Parameters
        ----------
        ecNumber : EcNumber
            The EC number to compare against.
        wildcardMatchesNumber : bool, optional
            If *True*, a wildcard acts as a sure match: '1.-.-.-'.matchingLevels('1.2.3.4') = 4.
            If *False*, a wildcard only matches another wildcard.

        Returns
        -------
        int
            Number of consecutive levels that match, if any, starting with the first (leftmost).
            '1.2.3.4'.matchingLevels('1.2.6.7') = 2 because the first two levels match consecutively.
            '1.2.3.4'.matchingLevels('2.2.3.4') = 0 because the very first level does not match.
        """
        matchingLevels = 0

        for selfLevel, otherLevel in zip(self.ecNumberLevels, ecNumber.ecNumberLevels):
            levelMatches = selfLevel == otherLevel
            if not levelMatches and wildcardMatchesNumber:
                # a wildcard on either side matches any number
                levelMatches = EcNumber.WILDCARD in (selfLevel, otherLevel)

            if not levelMatches: # only consecutive matches count, stop at the first mismatch
                break

            matchingLevels += 1

        return matchingLevels

    def hasWildcard(self) -> bool:
        """
        Whether this EC number contains a wildcard.

        Returns
        -------
        bool
            *True* if this EC number contains a wildcard (-) at any level, otherwise, returns *False*.
        """
        return EcNumber.WILDCARD in self.ecNumberLevels

    @staticmethod
    def removeWildcards(ecNumbers: Iterable) -> Iterable:
        """
        Remove EC numbers containing wildcards from an Iterable.

        Parameters
        ----------
        ecNumbers : Iterable[EcNumber]
            The EcNumber objects to check for wildcards.

        Returns
        -------
        Iterable[EcNumber]
            A new Iterable of the same type, containing only EC numbers which do **not** have a wildcard (-) anywhere. This does not deduplicate EC numbers.
            The type of `ecNumbers` has to be constructible from a list, like list, set, or tuple are.
        """
        validECnumbers = [ecNumber for ecNumber in ecNumbers if not ecNumber.hasWildcard()]
        return ecNumbers.__class__(validECnumbers)

    @staticmethod
    def insertWildcards(ecNumbers: Iterable, keepLevels = 3, allowHigherWildcards = True, returnSet = True, deduplicateList = False) -> Iterable:
        """
        Turns EC numbers without wildcards into EC numbers with wildcards.

        Returning them in a list preserves order.

        Parameters
        ----------
        ecNumbers : Iterable
            The EcNumber objects to abstract using wildcards.
        keepLevels : int, optional
            The first x levels of each EC number are kept intact. If `keepLevels` == 3, turns 1.2.3.4 into 1.2.3.-. Only 1, 2, 3, and 4 are allowed. EC numbers already containing wildcards are left unchanged.
        allowHigherWildcards : bool, optional
            If *False* and there is a wildcard in a level above 'keepLevels' (e.g. 3):, 1.2.3.4 -> 1.2.3.- and 2.3.4.- -> 2.3.4.-, but 3.4.-.- is removed completely.
        returnSet : bool, optional
            If *True*, returns results in a set. Takes precedence over 'deduplicateList', as sets automatically deduplicate.
        deduplicateList : bool, optional
            If *True*, result list is deduplicated before returning, preserving order.

        Returns
        -------
        Iterable
            Either a list or a set of abstracted EC numbers. These are new EcNumber objects, without description, name, or reaction.

        Raises
        ------
        ValueError
            If `keepLevels` is not one of [1, 2, 3, 4].
        """
        if keepLevels not in (1, 2, 3, 4):
            raise ValueError('Can not keep ' + str(keepLevels) + ' levels, there are only 1, 2, 3, or 4.')

        # the levels which are not kept are replaced by wildcards, the same for every EC number
        padding = [EcNumber.WILDCARD] * (4 - keepLevels)

        filtered = []
        for ecNumber in ecNumbers:
            keptLevels = ecNumber.ecNumberLevels[:keepLevels]

            if allowHigherWildcards is False and EcNumber.WILDCARD in keptLevels: # higher wildcard found but disallowed
                continue

            filtered.append( EcNumber.fromArray(keptLevels + padding) )

        if returnSet is True:
            return set( filtered )

        if deduplicateList is True:
            filtered = Util.deduplicateList(filtered, preserveOrder = True)

        return filtered

    def addDescription(self):
        """
        Query KEGG and add further description to this EC number.

        Sets `self.description`, `self.name`, and `self.reaction`, if KEGG returned something for this EC number. Otherwise, they are left unchanged.

        Warnings
        --------
        Much slower than doing :func:`addEcDescriptions` for several EC numbers in bulk!
        """
        EcNumber.addEcDescriptions([self])

    @staticmethod
    def addEcDescriptions(ecNumbers: Iterable):
        """
        Query KEGG for further descriptions and add them to each EC number in `ecNumbers`.

        Parameters
        ----------
        ecNumbers : Iterable[EcNumber]
            The EcNumber objects to add `description`, `name`, and `reaction` to. They are changed in place.
            EC numbers for which nothing was returned are left unchanged. Is iterated twice, so it must not be a one-shot iterator.
        """
        # imported here, not at the top of the file
        # NOTE(review): presumably to avoid a circular import -- confirm
        from FEV_KEGG.KEGG import Database

        ecNumberIdToEcEnzyme = Database.getEcEnzymeBulk(ecNumbers)
        for ecNumber in ecNumbers:
            ecEnzyme = ecNumberIdToEcEnzyme.get(ecNumber.uniqueID)
            if ecEnzyme is None:
                continue
            ecNumber.description = ecEnzyme.description
            ecNumber.name = ecEnzyme.name
            ecNumber.reaction = ecEnzyme.reaction

    def __lt__(self, other):
        return self.ecNumberLevelsInteger < other.ecNumberLevelsInteger

    def __gt__(self, other):
        return self.ecNumberLevelsInteger > other.ecNumberLevelsInteger

    def __le__(self, other):
        return self.ecNumberLevelsInteger <= other.ecNumberLevelsInteger

    def __ge__(self, other):
        return self.ecNumberLevelsInteger >= other.ecNumberLevelsInteger

    
    
class GeneID(Element):

    # Three or four lower-case letters of organism abbreviation, a colon, and a gene name of letters, digits, '_', '-', and '.'.
    # Raw string, because '\-' and '\.' are invalid escape sequences in a normal string literal.
    REGEX_PATTERN = re.compile(r'^[a-z]{3,4}:[a-zA-Z0-9_\-\.]+$')

    def __init__(self, geneIDString: 'eco:b0004'):
        """
        Represents an enzyme of metabolism by gene ID, e.g. 'eco:b0004'.

        Parameters
        ----------
        geneIDString : str
            Gene ID represented by a string, e.g. 'eco:b0004'. Will be checked for correct formatting!

        Attributes
        ----------
        self.geneIDString : str
            The gene ID, same as `self.uniqueID`.

        Raises
        ------
        ValueError
            If gene ID is not formatted correctly.
        """
        # check input
        if self.__class__.REGEX_PATTERN.match(geneIDString) is None: # wrong format
            raise ValueError('Gene ID not formatted correctly: ' + geneIDString)

        # determine unique ID
        Element.__init__(self, geneIDString)

        # save object attributes
        self.geneIDString = self.uniqueID

    @property
    def organismAbbreviation(self) -> str:
        """
        Returns
        -------
        str
            'eco' from 'eco:b0004'.
        """
        # the format check in __init__ allows exactly one colon
        return self.geneIDString.split(':')[0]

    @property
    def geneName(self) -> str:
        """
        Returns
        -------
        str
            'b0004' from 'eco:b0004'.
        """
        # the format check in __init__ allows exactly one colon
        return self.geneIDString.split(':')[1]
    
        
    
class KeggOrthologyID(Element):

    # a 'K' followed by exactly five digits
    REGEX_PATTERN = re.compile('^K[0-9]{5}$')

    def __init__(self, keggOrthologyIDString: 'K01733'):
        """
        An enzyme of metabolism, identified by its KEGG Orthology ID, e.g. 'K01733'.

        Parameters
        ----------
        keggOrthologyIDString : str
            KEGG Orthology ID as a string. Its format is validated against `REGEX_PATTERN`!

        Attributes
        ----------
        self.keggOrthologyIDString : str
            The KEGG Orthology ID, same as `self.uniqueID`.

        Raises
        ------
        ValueError
            If KEGG Orthology ID is not formatted correctly.
        """
        pattern = type(self).REGEX_PATTERN
        isWellFormed = pattern.match(keggOrthologyIDString) is not None
        if not isWellFormed:
            raise ValueError('KEGG Orthology ID not formatted correctly: ' + keggOrthologyIDString)

        # the validated string serves as unique ID, and is also available under a more specific name
        super().__init__(keggOrthologyIDString)
        self.keggOrthologyIDString = self.uniqueID