Source code for FEV_KEGG.KEGG.DataTypes

from _collections import defaultdict
import string

class Gene(object):
    """A single gene parsed from a KEGG GENE flat-file description."""
    
    # Translation table for str.translate(): ASCII digits map to themselves. Any other
    # code point is missing from the dict, so the default factory type(None) is called,
    # returns None, and str.translate() deletes that character.
    _digit_keeper = defaultdict(type(None))
    _digit_keeper.update({ord(c): c for c in string.digits})
    
    def __init__(self, content):
        """
        Gene as defined by a gene description file in KEGG GENE database.
        
        All attributes might be *None* (or empty), depending on whether they actually occur in the gene description! Occurrence varies between organisms and sources.
        Parsing stops at the first line starting with '///'. If no such line exists, every section up to the end of `content` is parsed.
        
        Parameters
        ----------
        content : str
            A multi-line gene description.
        
        Attributes
        ----------
        self.number : str
            The name of the gene, e.g. 'Acav_0021'.
        self.symbol : str
            Colloquial name of the gene product, e.g. 'ThrC'.
        self.name : str
            Long name of the gene, e.g. '(GenBank) Homoserine dehydrogenase'.
        
        self.isProtein : bool
            Whether the second word of the ENTRY section is 'CDS'.
        self.ecNumbers : List[str]
        self.isEnzyme : bool
            Whether at least one ORTHOLOGY line carries EC numbers. `isProtein` is not consulted.
        
        self.keggOrthologyNames : List[str]
            Names for each KEGG Orthology ID associated with this gene, e.g. 'homoserine dehydrogenase'.
        self.keggOrthologyIDs : List[str]
            Each KEGG Orthology ID associated with this gene, e.g. 'K00003'.
        self.keggOrthologies : List[Tuple[str, str, List[str]]]
            List of each associated KEGG Orthology entry, represented as a tuple of (ID, name, EC numbers), e.g. ('K00003', 'homoserine dehydrogenase', ['1.1.1.3']).
            Name and/or EC numbers are *None* if the line does not contain them.
        
        self.organismAbbreviation : str
        self.organismName : str
        self.organismTnumber : str
            T number of the organism this gene belongs to, e.g. 'T01445'.
        
        self.pathways : List[Tuple[str, str]]
            List of organism-specific pathways this gene occurs in, represented as a tuple of (ID, name), e.g. ('aaa00260', 'Glycine, serine and threonine metabolism').
        
        self.positionFrom : int
            Nucleotide sequence position this gene starts at.
        self.positionTo : int
            Nucleotide sequence position this gene ends at.
        self.positionIsComplement : bool
            Whether the positions are on the complement strand.
        
        self.aaseqLength : int
        self.aaseq : str
        
        self.ntseqLength : int
        self.ntseq : str
        
        Raises
        ------
        ValueError
            If `content` is not a string, if a sequence length is not an integer, or if a PATHWAY line has no two-space separator.
        IndexError
            If a section lacks fields the parser expects, e.g. an ENTRY section with fewer than three words.
        """
        try:
            if not isinstance(content, str):
                raise ValueError('Gene content is not a string.')
            
            # default values
            self.number = None
            self.isProtein = False
            self.organismTnumber = None
            
            self.name = None
            
            self.symbol = None
            
            self.ecNumbers = list()
            self.keggOrthologyNames = list()
            self.keggOrthologyIDs = list()
            self.keggOrthologies = list()
            self.isEnzyme = False
            
            self.organismAbbreviation = None
            self.organismName = None
            
            self.pathways = list()
            
            self.positionFrom = None
            self.positionTo = None
            self.positionIsComplement = False
            
            self.aaseqLength = None
            self.aaseq = None
            
            self.ntseqLength = None
            self.ntseq = None
            
            # parse file data
            for section, sectionContent in self._iterSections(content.splitlines()):
                
                if section == 'ENTRY':
                    
                    entryWords = sectionContent[0].split()
                    self.number = entryWords[0]
                    self.isProtein = entryWords[1] == 'CDS'
                    self.organismTnumber = entryWords[2]
                
                elif section == 'NAME':
                    
                    self.name = sectionContent[0]
                
                elif section == 'SYMBOL':
                    
                    self.symbol = sectionContent[0]
                
                elif section == 'ORTHOLOGY':
                    
                    self._parseOrthology(sectionContent)
                
                elif section == 'ORGANISM':
                    
                    organismParts = sectionContent[0].split('  ') # two spaces
                    self.organismAbbreviation = organismParts[0]
                    self.organismName = organismParts[1]
                
                elif section == 'PATHWAY':
                    
                    for contentLine in sectionContent:
                        if not contentLine: # blank line inside the section
                            continue
                        
                        # split only once, so a name containing two spaces stays intact
                        pathwayID, pathwayName = contentLine.split('  ', 1) # two spaces
                        self.pathways.append( (pathwayID, pathwayName) )
                
                elif section == 'POSITION':
                    
                    self._parsePosition(sectionContent[0])
                
                elif section == 'AASEQ':
                    
                    self.aaseqLength = int(sectionContent[0])
                    self.aaseq = ''.join(sectionContent[1:])
                
                elif section == 'NTSEQ':
                    
                    self.ntseqLength = int(sectionContent[0])
                    self.ntseq = ''.join(sectionContent[1:])
        
        except Exception:
            print( "Error while parsing a gene description into a KEGG.DataTypes.Gene object:" )
            print( content )
            raise
    
    @staticmethod
    def _iterSections(linesList):
        """Yield (keyword, content lines) for each section, stopping at the first '///' line."""
        currentSection = None
        currentContent = None
        
        for line in linesList:
            
            if len(line) == 0 or line[0] == ' ': # section content
                
                if currentSection is not None:
                    currentContent.append(line.lstrip())
                continue
            
            # section beginning, so the previous section is complete
            if currentSection is not None:
                yield currentSection, currentContent
            
            split = line.split(maxsplit = 1)
            
            if split[0].startswith('///'):
                return
            
            currentSection = split[0]
            currentContent = split[1:] # rest of the keyword line, if any
        
        # no terminator line: the section still being read must not be lost
        if currentSection is not None:
            yield currentSection, currentContent
    
    def _parseOrthology(self, contentLines):
        """Fill the KEGG Orthology and EC number attributes from the ORTHOLOGY content lines."""
        for contentLine in contentLines:
            if not contentLine: # blank line inside the section
                continue
            
            keggOrthologyID, _, rest = contentLine.partition('  ') # two spaces
            self.keggOrthologyIDs.append(keggOrthologyID)
            rest = rest.strip()
            
            if not rest: # has nothing but the ID
                
                longName = None
                ecNumbers = None
            
            else:
                
                restSplit = rest.split(' [EC:')
                longName = restSplit[0]
                self.keggOrthologyNames.append(longName)
                
                if len(restSplit) > 1: # has EC number and long name
                    
                    self.isEnzyme = True
                    ecNumbers = restSplit[1][:-1].split(' ') # drop the closing ']', numbers separated by one space
                    self.ecNumbers.extend(ecNumbers)
                
                else: # has only long name
                    
                    ecNumbers = None
            
            self.keggOrthologies.append( (keggOrthologyID, longName, ecNumbers) )
    
    def _parsePosition(self, positionLine):
        """
        Set the position attributes from the first POSITION content line.
        
        Only the second colon-separated field is inspected if the line contains a colon, otherwise the whole line.
        The first number found becomes `positionFrom`, the last of several becomes `positionTo`.
        Both stay *None* if no number is found, e.g. for 'X', 'Y' or 'Unknown'.
        """
        colonSplit = positionLine.split(':')
        
        if len(colonSplit) > 1: # there was a colon
            positionPart = colonSplit[1]
        else:
            positionPart = colonSplit[0]
        
        if 'complement' in positionLine:
            self.positionIsComplement = True
        
        # Treat ',' like '..' so that e.g. 'join(1..2,3..4)' yields four separate numbers
        # instead of gluing the digits of '2,3' together.
        tokens = positionPart.replace(',', '..').split('..')
        digitStrings = (token.translate(self._digit_keeper) for token in tokens)
        numbers = [int(digits) for digits in digitStrings if digits]
        
        self.positionFrom = numbers[0] if numbers else None
        self.positionTo = numbers[-1] if len(numbers) > 1 else None
    
    def getGeneID(self) -> 'GeneID':
        """
        Build the GeneID of this gene from '<organismAbbreviation>:<number>'.
        
        Raises
        ------
        TypeError
            If `organismAbbreviation` or `number` is *None*, because they can not be concatenated.
        """
        # imported locally rather than at module level -- presumably to avoid a circular import, TODO confirm
        from FEV_KEGG.Graph.Elements import GeneID
        return GeneID(self.organismAbbreviation + ':' + self.number)



class Substance(object):
    
    def __init__(self, content):
        """
        A compound/glycan found in KEGG pathways.
        
        Depending on whether this is a compound or a glycan, there are more attributes than listed below.
        They are only set if the respective section occurs in `content`, and are stored as unparsed strings:
        `formula`, `exact_mass` and `mol_weight` for compounds, `mass` for glycans.
        
        Parameters
        ----------
        content : str
            A multi-line substance description. Its first line has to contain either 'Glycan' or 'Compound'.
        
        Attributes
        ----------
        self.uniqueID : str
            Unique string identifying a substance, e.g. 'C00084'. Only set if an ENTRY section occurs.
        self.description : str
            Human-readable description of a substance: all lines of the NAME (compound) or COMPOSITION (glycan) section, joined by ' \\n'.
        self.shortestDescription : str
            The shortest line of that section, with every ';' removed.
        self.firstDescription : str
            The first line of that section, with every ';' removed, e.g. 'Acetaldehyde'.
        self.name : str
            Short human-readable description of a substance, e.g. 'Acetaldehyde'. Identical to `self.firstDescription`.
        
        Raises
        ------
        ValueError
            If `content` is not a string, has at most one line, or its first line names neither a glycan nor a compound.
        """
        try:
            if not isinstance(content, str):
                raise ValueError('Substance content is not a string.')
            
            linesList = content.splitlines()
            
            if len(linesList) <= 1:
                raise ValueError('Substance content has only one line.')
            
            # default values
            self.description = ''
            self.shortestDescription = ''
            self.firstDescription = ''
            
            # determine whether this is a compound or glycan
            firstLine = linesList[0]
            if 'Glycan' in firstLine:
                self._parseGlycan(linesList)
            elif 'Compound' in firstLine:
                self._parseCompound(linesList)
            else:
                raise ValueError('Substance type unknown.')
            
            self.name = self.firstDescription
            
        except Exception:
            print( "Error while parsing a substance description into a KEGG.DataTypes.Substance object:" )
            print( content )
            raise
    
    @staticmethod
    def _iterSections(linesList):
        """Yield (keyword, content lines) for each section, stopping at the first '///' line."""
        currentSection = None
        currentContent = None
        
        for line in linesList:
            
            if len(line) == 0 or line[0] == ' ': # section content
                
                if currentSection is not None:
                    currentContent.append(line.lstrip())
                continue
            
            # section beginning, so the previous section is complete
            if currentSection is not None:
                yield currentSection, currentContent
            
            split = line.split(maxsplit = 1)
            
            if split[0].startswith('///'):
                return
            
            currentSection = split[0]
            currentContent = split[1:] # rest of the keyword line, if any
        
        # no terminator line: the section still being read must not be lost
        if currentSection is not None:
            yield currentSection, currentContent
    
    def _setDescriptions(self, contentLines):
        """Derive the three description attributes from the lines of the describing section."""
        self.description = ' \n'.join(contentLines)
        self.shortestDescription = min(contentLines, key=len).replace(';','')
        self.firstDescription = contentLines[0].replace(';','')
    
    def _parseCompound(self, linesList):
        """Read ENTRY, NAME, FORMULA, EXACT_MASS and MOL_WEIGHT of a compound description."""
        for section, sectionContent in self._iterSections(linesList):
            
            if section == 'ENTRY':
                
                self.uniqueID = sectionContent[0].split()[0]
            
            elif section == 'NAME':
                
                self._setDescriptions(sectionContent)
            
            elif section == 'FORMULA':
                
                self.formula = sectionContent[0]
            
            elif section == 'EXACT_MASS':
                
                self.exact_mass = sectionContent[0]
            
            elif section == 'MOL_WEIGHT':
                
                self.mol_weight = sectionContent[0]
    
    def _parseGlycan(self, linesList):
        """Read ENTRY, COMPOSITION and MASS of a glycan description."""
        for section, sectionContent in self._iterSections(linesList):
            
            if section == 'ENTRY':
                
                self.uniqueID = sectionContent[0].split()[0]
            
            elif section == 'COMPOSITION':
                
                # a glycan is described by its composition, not by a NAME section
                self._setDescriptions(sectionContent)
            
            elif section == 'MASS':
                
                self.mass = sectionContent[0]


class EcEnzyme(object):
    
    def __init__(self, content):
        """
        An enzyme found in KEGG pathways, defined by its EC number.
        
        Parsing stops at the first line starting with '///'. If no such line exists, every section up to the end of `content` is parsed.
        
        Parameters
        ----------
        content : str
            A multi-line enzyme description.
        
        Attributes
        ----------
        self.uniqueID : str
            Unique string identifying the EC number, e.g. '4.1.2.48'. This is the second word of the ENTRY section, and is only set if that section occurs.
        self.description : str
            Human-readable description of the EC number: all lines of the NAME section, joined by ' \\n'.
        self.shortestDescription : str
            The shortest line of the NAME section, with every ';' removed, e.g. 'LtaE'.
        self.firstDescription : str
            The first line of the NAME section, with every ';' removed, e.g. 'low-specificity L-threonine aldolase'.
        self.name : str
            Short human-readable name of the EC number. Identical to `self.firstDescription`.
        self.reaction : str
            All lines of the REACTION section, joined by ' \\n'.
        
        Raises
        ------
        ValueError
            If `content` is not a string, or has at most one line.
        """
        try:
            if not isinstance(content, str):
                raise ValueError('Enzyme content is not a string.')
            
            linesList = content.splitlines()
            
            if len(linesList) <= 1:
                raise ValueError('Enzyme content has only one line.')
            
            # default values
            self.description = ''
            self.shortestDescription = ''
            self.firstDescription = ''
            self.reaction = ''
            
            # parse file data
            for section, sectionContent in self._iterSections(linesList):
                
                if section == 'ENTRY':
                    
                    # the first word is skipped, e.g. the 'EC' in 'EC 4.1.2.48'
                    self.uniqueID = sectionContent[0].split()[1]
                
                elif section == 'NAME':
                    
                    self.description = ' \n'.join(sectionContent)
                    self.shortestDescription = min(sectionContent, key=len).replace(';','')
                    self.firstDescription = sectionContent[0].replace(';','')
                
                elif section == 'REACTION':
                    
                    self.reaction = ' \n'.join(sectionContent)
            
            self.name = self.firstDescription
            
        except Exception:
            print( "Error while parsing an enzyme description into a KEGG.DataTypes.EcEnzyme object:" )
            print( content )
            raise
    
    @staticmethod
    def _iterSections(linesList):
        """Yield (keyword, content lines) for each section, stopping at the first '///' line."""
        currentSection = None
        currentContent = None
        
        for line in linesList:
            
            if len(line) == 0 or line[0] == ' ': # section content
                
                if currentSection is not None:
                    currentContent.append(line.lstrip())
                continue
            
            # section beginning, so the previous section is complete
            if currentSection is not None:
                yield currentSection, currentContent
            
            split = line.split(maxsplit = 1)
            
            if split[0].startswith('///'):
                return
            
            currentSection = split[0]
            currentContent = split[1:] # rest of the keyword line, if any
        
        # no terminator line: the section still being read must not be lost
        if currentSection is not None:
            yield currentSection, currentContent