from _collections import defaultdict
import string
class Gene(object):
    """Gene parsed from a gene description file of the KEGG GENE database."""

    # Translation table for str.translate(): decimal digits map to themselves,
    # every other code point falls back to None and is therefore deleted.
    _digit_keeper = defaultdict(type(None))
    _digit_keeper.update({ord(c): c for c in string.digits})

    def __init__(self, content):
        """
        Gene as defined by a gene description file in KEGG GENE database.

        All attributes might be *None* (or empty lists), depending on whether they actually occur in the gene description! Occurrence varies between organisms and sources.
        If `content` is not a string, nothing is parsed and every attribute keeps its default value.

        Parameters
        ----------
        content : str
            A multi-line gene description.

        Attributes
        ----------
        self.number : str
            The name of the gene, e.g. 'Acav_0021'.
        self.symbol : str
            Colloquial name of the gene product, e.g. 'ThrC'.
        self.name : str
            Long name of the gene, e.g. '(GenBank) Homoserine dehydrogenase'.
        self.isProtein : bool
            Whether the second word of the ENTRY section is 'CDS'.
        self.ecNumbers : List[str]
            All EC numbers found in the ORTHOLOGY section.
        self.isEnzyme : bool
            Set to *True* as soon as any ORTHOLOGY entry lists EC numbers. NOTE(review): `isProtein` is not consulted here.
        self.keggOrthologyNames : List[str]
            Names for each KEGG Orthology ID associated with this gene, e.g. 'homoserine dehydrogenase'.
        self.keggOrthologyIDs : List[str]
            Each KEGG Orthology ID associated with this gene, e.g. 'K00003'.
        self.keggOrthologies : List[Tuple[str, str, List[str]]]
            List of each associated KEGG Orthology entry, represented as a tuple of (ID, name, EC numbers), e.g. ('K00003', 'homoserine dehydrogenase', ['1.1.1.3']). Name and EC numbers are *None* when missing.
        self.organismAbbreviation : str
        self.organismName : str
        self.organismTnumber : str
            KEGG T number of the organism this gene belongs to, e.g. 'T01445'.
        self.pathways : List[Tuple[str, str]]
            List of organism-specific pathways this gene occurs in, represented as a tuple of (ID, name), e.g. ('aaa00260', 'Glycine, serine and threonine metabolism').
        self.positionFrom : int
            Nucleotide sequence position this gene starts at.
        self.positionTo : int
            Nucleotide sequence position this gene ends at.
        self.positionIsComplement : bool
            Whether the positions are on the complement strand.
        self.aaseqLength : int
        self.aaseq : str
        self.ntseqLength : int
        self.ntseq : str

        Raises
        ------
        Exception
            Any error raised while parsing is re-raised after the offending content has been printed.
        """
        # default values, set unconditionally so every attribute always exists
        self.number = None
        self.isProtein = False
        self.organismTnumber = None
        self.name = None
        self.symbol = None
        self.ecNumbers = list()
        self.keggOrthologyNames = list()
        self.keggOrthologyIDs = list()
        self.keggOrthologies = list()
        self.isEnzyme = False
        self.organismAbbreviation = None
        self.organismName = None
        self.pathways = list()
        self.positionFrom = None
        self.positionTo = None
        self.positionIsComplement = False
        self.aaseqLength = None
        self.aaseq = None
        self.ntseqLength = None
        self.ntseq = None

        if not isinstance(content, str):
            return

        try:
            for section, sectionContent in self._iterSections(content.splitlines()):
                self._processSection(section, sectionContent)
        except Exception:
            print( "Error while parsing a gene description into a KEGG.DataTypes.Gene object:" )
            print( content )
            raise

    @staticmethod
    def _iterSections(linesList):
        """Yield (section name, list of content lines) for each section of a KEGG flat file, stopping at '///'."""
        currentSection = None
        currentContent = None
        for line in linesList:
            if len(line) == 0 or line[0] == ' ': # section content
                if currentSection is not None:
                    currentContent.append(line.lstrip())
                continue
            split = line.split(maxsplit = 1)
            if not split: # whitespace-only line not starting with a space, e.g. a lone tab
                continue
            # a new section begins, so the previous one is complete
            if currentSection is not None:
                yield currentSection, currentContent
                currentSection = None
            firstWord = split[0]
            if firstWord.startswith('///'): # end-of-entry marker
                return
            currentContent = [split[1]] if len(split) > 1 else []
            currentSection = firstWord
        # no '///' terminator was seen: do not lose the section still being read
        if currentSection is not None:
            yield currentSection, currentContent

    def _processSection(self, section, content):
        """Store the data of a single section in the matching attributes; unknown or empty sections are ignored."""
        if not content:
            return
        if section == 'ENTRY':
            words = content[0].split()
            if words:
                self.number = words[0]
            self.isProtein = len(words) > 1 and words[1] == 'CDS'
            if len(words) > 2:
                self.organismTnumber = words[2]
        elif section == 'NAME':
            self.name = content[0]
        elif section == 'SYMBOL':
            self.symbol = content[0]
        elif section == 'ORTHOLOGY':
            self._parseOrthology(content)
        elif section == 'ORGANISM':
            words = content[0].split(None, 1) # abbreviation, then the full name
            if words:
                self.organismAbbreviation = words[0]
            if len(words) > 1:
                self.organismName = words[1]
        elif section == 'PATHWAY':
            for contentLine in content:
                words = contentLine.split(None, 1) # ID, then the name, which may contain spaces itself
                if not words: # blank line
                    continue
                pathwayName = words[1] if len(words) > 1 else None
                self.pathways.append( (words[0], pathwayName) )
        elif section == 'POSITION':
            self._parsePosition(content[0])
        elif section == 'AASEQ':
            self.aaseqLength = int(content[0])
            self.aaseq = ''.join(content[1:])
        elif section == 'NTSEQ':
            self.ntseqLength = int(content[0])
            self.ntseq = ''.join(content[1:])

    def _parseOrthology(self, content):
        """Parse ORTHOLOGY lines of the form 'K00003  long name [EC:1.1.1.3 2.7.2.4]'."""
        for contentLine in content:
            words = contentLine.split(None, 1) # ID, then everything else
            if not words: # blank line
                continue
            keggOrthologyID = words[0]
            rest = words[1] if len(words) > 1 else ''
            self.keggOrthologyIDs.append(keggOrthologyID)

            longName, ecMarker, ecPart = rest.partition(' [EC:')
            if ecMarker: # has EC number and long name
                self.isEnzyme = True
                ecNumbers = ecPart.strip().rstrip(']').split() # drop the closing bracket
                self.ecNumbers.extend(ecNumbers)
                self.keggOrthologyNames.append(longName)
            elif longName: # has only long name
                ecNumbers = None
                self.keggOrthologyNames.append(longName)
            else: # has nothing
                ecNumbers = None
                longName = None
            self.keggOrthologies.append( (keggOrthologyID, longName, ecNumbers) )

    def _parsePosition(self, positionText):
        """Parse a POSITION line such as '1:complement(12345..67890)' or 'join(1..100,200..300)'."""
        prefixSplit = positionText.split(':')
        # anything before a colon is the chromosome/plasmid prefix and is not part of the location
        location = prefixSplit[1] if len(prefixSplit) > 1 else prefixSplit[0]
        if 'complement' in positionText:
            self.positionIsComplement = True
        bounds = location.split('..')
        self.positionFrom = self._toPosition(bounds[0])
        # use the LAST bound: in 'join(a..b,c..d)' the middle piece 'b,c' is not a single number
        self.positionTo = self._toPosition(bounds[-1]) if len(bounds) > 1 else None

    def _toPosition(self, text):
        """Return the digits contained in `text` as int, or None if there are none (e.g. 'X', 'Y', 'Unknown')."""
        digits = text.translate(self.__class__._digit_keeper)
        return int(digits) if digits else None

    def getGeneID(self) -> 'GeneID':
        """
        Build the GeneID of this gene.

        Returns
        -------
        GeneID
            Created from '<organismAbbreviation>:<number>'.
        """
        # imported locally, as in the original code
        from FEV_KEGG.Graph.Elements import GeneID
        return GeneID(self.organismAbbreviation + ':' + self.number)
class Substance(object):
    """A compound or glycan parsed from a KEGG flat-file description."""

    def __init__(self, content):
        """
        A compound/glycan found in KEGG pathways.

        Depending on whether this is a compound or a glycan, there are more attributes than listed below
        (compound: `formula`, `exact_mass`, `mol_weight`; glycan: `mass`). These, like `uniqueID`, only exist if the
        respective section occurs in `content`.

        Parameters
        ----------
        content : str
            A multi-line substance description. Its first line has to contain 'Glycan' or 'Compound'.

        Attributes
        ----------
        self.uniqueID : str
            Unique string identifying a substance, e.g. 'C00084'.
        self.description : str
            Human-readable description of a substance: all lines of the NAME (compound) or COMPOSITION (glycan) section, joined by ' \\n'.
        self.firstDescription : str
            First line of the description, with ';' removed.
        self.shortestDescription : str
            Shortest line of the description, with ';' removed.
        self.name : str
            Short human-readable description of a substance, e.g. 'Acetaldehyde'. Identical to `self.firstDescription`.

        Raises
        ------
        ValueError
            If `content` is not a string, has fewer than two lines, or is neither a glycan nor a compound.
        """
        try:
            if not isinstance(content, str):
                raise ValueError('Substance content is not a string.')

            linesList = content.splitlines()
            if len(linesList) <= 1:
                raise ValueError('Substance content has only one line.')

            # default values
            self.description = ''
            self.shortestDescription = ''
            self.firstDescription = ''

            # determine whether this is a compound or glycan
            firstLine = linesList[0]
            if 'Glycan' in firstLine:
                self._parseGlycan(linesList)
            elif 'Compound' in firstLine:
                self._parseCompound(linesList)
            else:
                raise ValueError('Substance type unknown.')

            self.name = self.firstDescription

        except Exception:
            print( "Error while parsing a substance description into a KEGG.DataTypes.Substance object:" )
            print( content )
            raise

    @staticmethod
    def _iterSections(linesList):
        """Yield (section name, list of content lines) for each section of a KEGG flat file, stopping at '///'."""
        currentSection = None
        currentContent = None
        for line in linesList:
            if len(line) == 0 or line[0] == ' ': # section content
                if currentSection is not None:
                    currentContent.append(line.lstrip())
                continue
            split = line.split(maxsplit = 1)
            if not split: # whitespace-only line not starting with a space, e.g. a lone tab
                continue
            # a new section begins, so the previous one is complete
            if currentSection is not None:
                yield currentSection, currentContent
                currentSection = None
            firstWord = split[0]
            if firstWord.startswith('///'): # end-of-entry marker
                return
            currentContent = [split[1]] if len(split) > 1 else []
            currentSection = firstWord
        # no '///' terminator was seen: do not lose the section still being read
        if currentSection is not None:
            yield currentSection, currentContent

    def _setDescriptions(self, descriptionLines):
        """Derive `description`, `shortestDescription` and `firstDescription` from the lines of a section."""
        self.description = ' \n'.join(descriptionLines)
        self.shortestDescription = min(descriptionLines, key=len).replace(';','')
        self.firstDescription = descriptionLines[0].replace(';','')

    def _parseCompound(self, linesList):
        """Parse the lines of a compound description: ENTRY, NAME, FORMULA, EXACT_MASS and MOL_WEIGHT sections."""
        for section, sectionContent in self._iterSections(linesList):
            if not sectionContent: # header without any content carries no data
                continue
            if section == 'ENTRY':
                self.uniqueID = sectionContent[0].split()[0]
            elif section == 'NAME':
                self._setDescriptions(sectionContent)
            elif section == 'FORMULA':
                self.formula = sectionContent[0]
            elif section == 'EXACT_MASS':
                self.exact_mass = sectionContent[0]
            elif section == 'MOL_WEIGHT':
                self.mol_weight = sectionContent[0]

    def _parseGlycan(self, linesList):
        """Parse the lines of a glycan description: ENTRY, COMPOSITION and MASS sections."""
        for section, sectionContent in self._iterSections(linesList):
            if not sectionContent: # header without any content carries no data
                continue
            if section == 'ENTRY':
                self.uniqueID = sectionContent[0].split()[0]
            elif section == 'COMPOSITION':
                self._setDescriptions(sectionContent)
            elif section == 'MASS':
                self.mass = sectionContent[0]
class EcEnzyme(object):
    """An enzyme, identified by its EC number, parsed from a KEGG flat-file description."""

    def __init__(self, content):
        """
        An enzyme found in KEGG pathways, defined by its EC number.

        Parameters
        ----------
        content : str
            A multi-line enzyme description.

        Attributes
        ----------
        self.uniqueID : str
            Unique string identifying the EC number, e.g. '4.1.2.48'. Taken from the second word of the ENTRY section; only exists if that section occurs.
        self.description : str
            Human-readable description of the EC number: all lines of the NAME section, joined by ' \\n'.
        self.firstDescription : str
            First line of the NAME section, with ';' removed.
        self.shortestDescription : str
            Shortest line of the NAME section, with ';' removed, e.g. 'LtaE'.
        self.name : str
            Short human-readable name of the EC number. Identical to `self.firstDescription`.
        self.reaction : str
            All lines of the REACTION section, joined by ' \\n'.

        Raises
        ------
        ValueError
            If `content` is not a string or has fewer than two lines.
        """
        try:
            if not isinstance(content, str):
                raise ValueError('Enzyme content is not a string.')

            linesList = content.splitlines()
            if len(linesList) <= 1:
                raise ValueError('Enzyme content has only one line.')

            # default values
            self.description = ''
            self.shortestDescription = ''
            self.firstDescription = ''
            self.reaction = ''

            # parse file data
            for section, sectionContent in self._iterSections(linesList):
                if not sectionContent: # header without any content carries no data
                    continue
                if section == 'ENTRY':
                    # the ENTRY line reads 'EC <number> Enzyme', the number is the second word
                    self.uniqueID = sectionContent[0].split()[1]
                elif section == 'NAME':
                    self.description = ' \n'.join(sectionContent)
                    self.shortestDescription = min(sectionContent, key=len).replace(';','')
                    self.firstDescription = sectionContent[0].replace(';','')
                elif section == 'REACTION':
                    self.reaction = ' \n'.join(sectionContent)

            self.name = self.firstDescription

        except Exception:
            print( "Error while parsing an enzyme description into a KEGG.DataTypes.EcEnzyme object:" )
            print( content )
            raise

    @staticmethod
    def _iterSections(linesList):
        """Yield (section name, list of content lines) for each section of a KEGG flat file, stopping at '///'."""
        currentSection = None
        currentContent = None
        for line in linesList:
            if len(line) == 0 or line[0] == ' ': # section content
                if currentSection is not None:
                    currentContent.append(line.lstrip())
                continue
            split = line.split(maxsplit = 1)
            if not split: # whitespace-only line not starting with a space, e.g. a lone tab
                continue
            # a new section begins, so the previous one is complete
            if currentSection is not None:
                yield currentSection, currentContent
                currentSection = None
            firstWord = split[0]
            if firstWord.startswith('///'): # end-of-entry marker
                return
            currentContent = [split[1]] if len(split) > 1 else []
            currentSection = firstWord
        # no '///' terminator was seen: do not lose the section still being read
        if currentSection is not None:
            yield currentSection, currentContent