Source code for FEV_KEGG.KEGG.Organism

from builtins import str
import re

from FEV_KEGG.lib.Biopython.KEGG.KGML import KGML_pathway
from FEV_KEGG.KEGG.DataTypes import Gene
from typing import Set, List, Iterable, Generator

from FEV_KEGG.Graph.SubstanceGraphs import SubstanceEnzymeGraph, SubstanceEcGraph, SubstanceGeneGraph, SubstanceReactionGraph, Conversion
from FEV_KEGG.KEGG import Database
from FEV_KEGG.KEGG.Database import NoKnownPathwaysError
from FEV_KEGG.KEGG.File import cache, cacheEntry
import tqdm
import FEV_KEGG.settings as settings

import concurrent.futures
from FEV_KEGG.Util import Parallelism
import gc
import FEV_KEGG.quirks as quirks
from math import ceil



[docs]class Organism(object): GLOBAL_PATHWAY_PATTERN = re.compile('01[12][0-9]{2}') """ Pattern defining a global/overview pathway. These pathways should contain nothing more and nothing less than what is included in all non-global pathways. At least concerning metabolic pathways. Alas, they do not. Global/Overview pathways are often inconsistent with the union of all pathways. Also, they discard information about edge direction, containing only undirected edges. """ _DIGITS_PATTERN = re.compile('\d+') def __init__(self, nameAbbreviation: 'eco', skipExistsCheck = False): """ An Organism as listed in KEGG, e.g. Escherichia coli K-12 MG1655 "eco". Checks whether the organism actually exists before creating the object. Parameters ---------- nameAbbreviation : str The abbreviation of an organism as used in KEGG. skipExistsCheck : bool, optional If *True*, skips the check for existence of an organism identified by `nameAbbreviation`. Any subsequent method access may raise an error if the organism does not exist in KEGG! Attributes ---------- self.nameAbbreviation : str Raises ------ ValueError If the organism does not exist at all in KEGG. URLError If the connection to the KEGG server fails and the requested organism has not already been cached. Note ---- All operations are cached via :class:`FEV_KEGG.KEGG.Database`. This includes any downloads and the calculations available directly via this class. """ self.nameAbbreviation = nameAbbreviation if skipExistsCheck is False and Database.doesOrganismExist(nameAbbreviation) is False: raise ValueError('Organism abbreviation does not exist: ' + nameAbbreviation)
[docs] @classmethod def __initBulk__(cls, nameAbbreviations: List[str]) -> List['Organism']: """ Creates many :class:`Organism` objects at once. Checking for existence is much faster when done this way, because it is parallelised. Pathologically non-existent organisms are automatically filtered, see :attr:`FEV_KEGG.quirks.NON_EXISTING_ORGANISMS` Parameters ---------- nameAbbreviations : list[str] List of abbreviation strings which will be used to create an :class:`Organism`. Returns ------- List[Organism] List of :class:`Organism` objects. Or *None* if none of the `nameAbbreviations` existed. Raises ------ ValueError If any organism does not exist at all in KEGG. URLError If the connection to the KEGG server fails and the requested organism has not already been cached. """ existingOrganisms = Database.doesOrganismExistBulk(nameAbbreviations) if len(existingOrganisms) == 0: return None else: if len(nameAbbreviations) != len(existingOrganisms): # some organism do not exist nonExistingOrganisms = set(nameAbbreviations).difference_update(set(existingOrganisms)) raise ValueError('Organism abbreviations do not exist: ' + ', '.join(nonExistingOrganisms)) organisms = [] # filter quirky organisms, without any pathway for nonExisting in quirks.NON_EXISTING_ORGANISMS: while True: try: existingOrganisms.remove(nonExisting) except ValueError: break for abbreviation in existingOrganisms: organism = cls(abbreviation, skipExistsCheck = True) organisms.append(organism) return organisms
def __str__(self): return 'Organism(' + self.nameAbbreviation + ')' def __eq__(self, other): if isinstance(self, other.__class__): return self.nameAbbreviation == other.nameAbbreviation return False def __ne__(self, other): return not self == other def __hash__(self): return self.nameAbbreviation.__hash__() def __lt__(self, other): return self.nameAbbreviation.lower() < other.nameAbbreviation.lower() def __gt__(self, other): return self.nameAbbreviation.lower() > other.nameAbbreviation.lower() def __le__(self, other): return self.nameAbbreviation.lower() <= other.nameAbbreviation.lower() def __ge__(self, other): return self.nameAbbreviation.lower() >= other.nameAbbreviation.lower()
[docs] def getPathway(self, pathwayName: '00260') -> KGML_pathway.Pathway: """ Gets a certain pathway for this organism. Parameters ---------- pathwayName : str The name/number of the pathway, e.g. "00260". This will be expanded with `self.nameAbbreviation` to, e.g. "eco:00260". Returns ------- Pathway The pathway object, or *None* if such a pathway does not exist. Raises ------ HTTPError If pathway does not exist. URLError If connection to KEGG fails. """ return Database.getPathway(self.nameAbbreviation, pathwayName)
[docs] def getPathways(self, includeOverviewMaps = False) -> Set[KGML_pathway.Pathway]: """ Gets a set of all pathway objects for this organism. Parameters ---------- includeOverviewMaps : bool, optional Whether to include global/overview maps. Returns ------- Set[Pathway] Set of pathways objects for all known pathways. Raises ------ NoKnownPathwaysError If the organism has no known pathways. HTTPError If pathway does not exist. URLError If connection to KEGG fails. """ return self.getPathwaysFromNames(self.getPathwayNames(self.getPathwayIDs(self.getPathwayDescriptions(includeOverviewMaps))))
[docs] def getMetabolicPathways(self, includeOverviewMaps = False) -> Set[KGML_pathway.Pathway]: """ Gets a set of all metabolic pathway objects for this organism. Parameters ---------- includeOverviewMaps : bool, optional Whether to include global/overview maps. Returns ------- Set[Pathway] Set of pathways objects for all known metabolic pathways. Raises ------ HTTPError If pathway does not exist. URLError If connection to KEGG fails. """ return self.getPathwaysFromNames(self.getPathwayNames(self.getPathwayIDs(self.getMetabolicPathwayDescriptions(includeOverviewMaps))))
@property def metabolicPathways(self) -> Set[KGML_pathway.Pathway]: """ Pathways of metabolism, without overview or global maps. Returns ------- Set[Pathway] Set of pathways objects for all known metabolic pathways, excluding global/overview pathways. Raises ------ HTTPError If pathway does not exist. URLError If connection to KEGG fails. """ return self.getMetabolicPathways(includeOverviewMaps = False)
[docs] def getPathwaysFromNames(self, pathwayNameSet: Set[str]) -> Set[KGML_pathway.Pathway]: """ Gets a set of pathway objects for this organism, based on a set of pathway names, eg. {'00260', '01100'}. Parameters ---------- pathwayNameSet : Set[str] The names/numbers of the pathways, e.g. '00260'. This will be expanded with `self.nameAbbreviation` to, e.g. 'eco00260'. Returns ------- Set[Pathway] Set of pathways objects for all pathways from `pathwayNameSet`. If pathway exists, but has no KGML format, the entry for this pathway is *None*. Raises ------ HTTPError If pathway does not exist. URLError If connection to KEGG fails. """ return Database.getPathwayBulk(self.nameAbbreviation, pathwayNameSet).values()
[docs] def getPathwayDescriptions(self, includeOverviewMaps = False) -> Set[str]: """ Get pathway descriptions for this organism. Parameters ---------- includeOverviewMaps : bool, optional Whether to include global/overview maps. Returns ------- Set[str] Set of pathway descriptions for all known pathways. Raises ------ NoKnownPathwaysError If the organism has no known pathways. HTTPError If any other HTTP error occurs. URLError If connection to KEGG fails. """ descriptions = Database.getPathwayDescriptions(self.nameAbbreviation) if includeOverviewMaps == True: return descriptions else: return self.__class__._filterGlobalAndOverview(descriptions)
[docs] @classmethod def _filterGlobalAndOverview(cls, pathwayDescriptions: Set[str]) -> Set[str]: """ Removes pathway descriptions of pathways belonging to global or overview maps. Parameters ---------- pathwayDescriptions : Set[str] The pathway descriptions to filter. Returns ------- Set[str] Pathway descriptions, leaving only the ones **not** from a global/overview pathway. """ newSet = set() for pathwayString in pathwayDescriptions: if cls.GLOBAL_PATHWAY_PATTERN.search(pathwayString) is None: # not a global/overview map newSet.add(pathwayString) return newSet
[docs] def getMetabolicPathwayDescriptions(self, includeOverviewMaps = False) -> Set[str]: """ Get descriptions of pathways that are part of metabolism. Parameters ---------- includeOverviewMaps : bool, optional Whether to include global/overview maps. Returns ------- Set[str] Set of pathway descriptions for all known metabolic pathways. Raises ------ NoKnownPathwaysError If the organism has no known pathways. HTTPError If pathway description list should not exist. Which would be odd, if we are certain and tested that the organism itself exists. URLError If connection to KEGG fails. """ descriptions = self.getPathwayDescriptions(includeOverviewMaps) return self._filterNonMetabolic(descriptions)
[docs] def _filterNonMetabolic(self, pathwayDescriptions: Set[str]) -> Set[str]: """ Removes pathway descriptions of pathways not belonging to metabolism. Parameters ---------- pathwayDescriptions : Set[str] The pathway descriptions to filter. Returns ------- Set[str] Pathway descriptions, leaving only the ones from pathways belonging to metabolism. See Also -------- FEV_KEGG.quirks.METABOLIC_PATHWAYS : List of names for all pathways belonging to metabolism. """ newSet = set() for pathwayString in pathwayDescriptions: tempSet = set() tempSet.add(pathwayString) if self.getPathwayNames(self.getPathwayIDs(tempSet)).pop() in quirks.METABOLIC_PATHWAYS: # is a metabolism pathway newSet.add(pathwayString) return newSet
[docs] def getPathwayIDs(self, pathwayDescriptions: Set[str]) -> Set[str]: """ Get pathway IDs from a set of descriptions. A pathway ID is the occurence of a specific pathway in a specific organism, e.g. pathway '00260' in 'eco' -> 'eco00260'. Parameters ---------- pathwayDescriptions : Set[str] The pathway descriptions to be searched, e.g. 'path:eco00260 Glycine, serine and threonine metabolism - Escherichia coli K-12 MG1655'. Returns ------- Set[str] Pathway IDs, e.g. 'eco00260'. """ pathwayIDSet = set() for pathway in pathwayDescriptions: pathwayID = pathway.split('\t')[0].replace('path:','') pathwayIDSet.add(pathwayID) return pathwayIDSet
[docs] def getPathwayNames(self, pathwayIDs: Set[str]) -> Set[str]: """ Get pathway names from a set of IDs. A pathway name is a specific pathway, independent from any organism. Parameters ---------- pathwayDescriptions : Set[str] The pathway IDs to be searched, e.g. 'eco00260'. Returns ------- Set[str] Pathway name, e.g. '00260'. """ pathwayNameSet = set() for pathwayID in pathwayIDs: pathwayNameSet.add(pathwayID.replace(self.nameAbbreviation, '')) return pathwayNameSet
# Gene ====================================================================================
[docs] def getGene(self, gene: 'eco:b0004 or b0004') -> Gene: """ Get a certain gene object for this organism. Automatically recognises format. Parameters ---------- gene : str Gene ID or name, e.g. 'eco:b0004' or 'b0004'. Returns ------- Gene Gene object. Raises ------ HTTPError If gene does not exist. URLError If connection to KEGG fails. """ if ':' in gene: return self.getGeneByID(gene) else: return self.getGeneByName(gene)
[docs] def getGeneByName(self, geneName: 'b0004') -> Gene: """ Get a certain gene object for this organism. Automatically prepends organism, eg. 'eco:'+geneName. Parameters ---------- geneName : str Gene name, e.g. 'b0004'. Returns ------- Gene Gene object. Raises ------ HTTPError If gene does not exist. URLError If connection to KEGG fails. """ gene = Database.getGene(self.nameAbbreviation + ':' + geneName) return gene
[docs] def getGeneByID(self, geneID: 'eco:b0004') -> Gene: """ Get a certain gene object for this organism. Does not check if the prefix matches this organism! Parameters ---------- geneID : str Gene name, e.g. 'eco:b0004'. Returns ------- Gene Gene object. Raises ------ HTTPError If gene does not exist. URLError If connection to KEGG fails. """ gene = Database.getGene(geneID) return gene
[docs] def getGeneIDs(self, pathway: 'KGML_pathway.Pathway or 00260') -> Set[str]: """ Get the set of all gene IDs of this organism in a certain pathway. Automatically chooses :func:`getGeneIDsByName` or :func:`getGeneIDsByPathway`, depending on the type of `pathway`. Deduplicates original list. Parameters ---------- pathway : Pathway or str The pathway to search, either as :class:`FEV_KEGG.lib.Biopython.KEGG.KGML.KGML_pathway.Pathway` object or its name as a string, e.g. '00260'. Returns ------- Set[str] List of gene IDs in `pathway`, e.g. ['eco:b0632', 'eco:b0839', 'eco:b2010']. Raises ------ HTTPError If name passed and pathway does not exist. URLError If name passed and connection fails. """ if pathway.__class__ == KGML_pathway.Pathway: return self.getGeneIDsByPathway(pathway) else: return self.getGeneIDsByName(pathway)
[docs] def getGeneIDsByPathway(self, pathway: KGML_pathway.Pathway) -> Set[str]: """ Get the set of all gene IDs of this organism for a :class:`FEV_KEGG.lib.Biopython.KEGG.KGML.KGML_pathway.Pathway` object. Deduplicates original list. Parameters ---------- pathway : Pathway The pathway to search. Returns ------- Set[str] List of gene IDs in `pathway`, e.g. ['eco:b0632', 'eco:b0839', 'eco:b2010']. """ pathwayNumber = self.__class__._DIGITS_PATTERN.findall(pathway.name)[0] geneNameList = Database.getPathwayGeneIDs(self.nameAbbreviation, pathwayNumber) # try to get list from disk # if not on disk, calculate the list if geneNameList == None: geneNameList = self._calculateGeneIDs(pathway, pathwayNumber) return geneNameList
[docs] def getGeneIDsByName(self, pathwayName: '00260') -> Set[str]: """ Get the set of all gene IDs of this organism for a pathway name. Deduplicates original list. Parameters ---------- pathwayName : str The pathway name to search, e.g. '00260'. Returns ------- Set[str] List of gene IDs in `pathwayName`, e.g. ['eco:b0632', 'eco:b0839', 'eco:b2010']. Raises ------ HTTPError If pathway does not exist. URLError If connection to KEGG fails. """ geneNameList = Database.getPathwayGeneIDs(self.nameAbbreviation, pathwayName) # try to get list from disk # if not on disk, calculate the list if geneNameList == None: geneNameList = self._calculateGeneIDs(self.getPathway(pathwayName), pathwayName) return geneNameList
def _calculateGeneIDs(self, pathway: KGML_pathway.Pathway, pathwayName: '00260') -> Set[str]: geneNameSet = set() geneDescriptionList = pathway.genes for geneDescription in geneDescriptionList: namePossiblyList = geneDescription.name nameList = namePossiblyList.split(' ') for name in nameList: geneNameSet.add(name) Database.setPathwayGeneIDs(self.nameAbbreviation, pathwayName, geneNameSet) # cache the deduplicated list to disk return geneNameSet
[docs] def getGenes(self, pathway: 'KGML_pathway.Pathway or 00260') -> Set[Gene]: """ Get the set of all genes of this organism in a certain pathway. Automatically chooses :func:`getGenesByName` or :func:`getGenesByPathway`, depending on the type of `pathway`. Deduplicates original list. Parameters ---------- pathway : Pathway or str The pathway to search, either as :class:`FEV_KEGG.lib.Biopython.KEGG.KGML.KGML_pathway.Pathway` object or its name as a string, e.g. '00260'. Returns ------- Set[Gene] List of gene IDs in `pathway`, e.g. ['eco:b0632', 'eco:b0839', 'eco:b2010']. Raises ------ HTTPError If name passed and pathway does not exist. URLError If name passed and connection fails. """ if pathway.__class__ == KGML_pathway.Pathway: return self.getGenesByPathway(pathway) else: return self.getGenesByName(pathway)
[docs] def getGenesByName(self, pathwayName: '00260') -> Set[Gene]: """ Get the set of all genes of this organism for a pathway name. Deduplicates original list. Parameters ---------- pathwayName : str The pathway name to search, e.g. '00260'. Returns ------- Set[Gene] List of gene IDs in `pathwayName`, e.g. ['eco:b0632', 'eco:b0839', 'eco:b2010']. Raises ------ HTTPError If pathway does not exist. URLError If connection to KEGG fails. """ geneList = set() geneIDList = self.getGeneIDsByName(pathwayName) for geneID in geneIDList: geneList.add(self.getGeneByID(geneID)) return geneList
[docs] def getGenesByPathway(self, pathway: KGML_pathway.Pathway) -> Set[Gene]: """ Get the set of all genes of this organism for a :class:`FEV_KEGG.lib.Biopython.KEGG.KGML.KGML_pathway.Pathway` object. Deduplicates original list. Parameters ---------- pathway : Pathway The pathway to search. Returns ------- Set[Gene] List of gene IDs in `pathway`, e.g. ['eco:b0632', 'eco:b0839', 'eco:b2010']. """ geneList = set() geneIDList = self.getGeneIDsByPathway(pathway) for geneID in geneIDList: geneList.add(self.getGeneByID(geneID)) return geneList
[docs] def getNumberOfGenes(self): """ Get the number of known genes within this genome. Returns ------- int Count of genes in this organism's genome. These do not necessarily have to be mentioned in any pathway. """ organismInfo = Database.getOrganismInfo(self.nameAbbreviation, checkExpiration = False) return Database._extractGeneEntries(organismInfo)
# Graph ====================================================================================
[docs] def substanceEcGraph(self, noMultifunctional = settings.defaultNoMultifunctional, returnCacheEntry = False) -> SubstanceEcGraph: """ Substance-EC graph of this organism. Parameters ---------- noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. returnCacheEntry : bool, optional If *True*, do not return the graph, but instead a :class:`FEV_KEGG.KEGG.File.CacheEntry`. This cache entry can be useful for parallel computation. Returns ------- SubstanceEcGraph The graph has substrates/products as its nodes and EC numbers as the connecting edges. Edges have a direction. Raises ------ HTTPError If any gene or pathway does not exist. URLError If connection to KEGG fails. """ file_name = 'SubstanceEcGraph' if noMultifunctional is True: file_name += '_noMultifunctional' folder_path = 'organism/' + self.nameAbbreviation + '/graph' if returnCacheEntry is False: # shall return result decorator = cache(folder_path = folder_path, file_name = file_name) else: # shall return CacheEntry object decorator = cacheEntry(folder_path = folder_path, file_name = file_name) function = lambda: Conversion.SubstanceEnzymeGraph2SubstanceEcGraph(self.substanceEnzymeGraph(noMultifunctional)) return decorator(function)()
[docs] def substanceEnzymeGraph(self, noMultifunctional = settings.defaultNoMultifunctional, returnCacheEntry = False) -> SubstanceEnzymeGraph: """ Substance-Enzyme graph of this organism. Parameters ---------- noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. returnCacheEntry : bool, optional If *True*, do not return the graph, but instead a :class:`FEV_KEGG.KEGG.File.CacheEntry`. This cache entry can be useful for parallel computation. Returns ------- SubstanceEnzymeGraph The graph has substrates/products as its nodes and enzymes as the connecting edges. Edges have a direction. Raises ------ HTTPError If any gene or pathway does not exist. URLError If connection to KEGG fails. """ file_name = 'SubstanceEnzymeGraph' if noMultifunctional is True: file_name += '_noMultifunctional' folder_path = 'organism/' + self.nameAbbreviation + '/graph' if returnCacheEntry is False: # shall return result decorator = cache(folder_path = folder_path, file_name = file_name) else: # shall return CacheEntry object decorator = cacheEntry(folder_path = folder_path, file_name = file_name) function = lambda: Conversion.SubstanceGeneGraph2SubstanceEnzymeGraph(self.substanceGeneGraph(), noMultifunctional) return decorator(function)()
[docs] def substanceGeneGraph(self, returnCacheEntry = False) -> SubstanceGeneGraph: """ Substance-Gene graph of this organism. Parameters ---------- returnCacheEntry : bool, optional If *True*, do not return the graph, but instead a :class:`FEV_KEGG.KEGG.File.CacheEntry`. This cache entry can be useful for parallel computation. Returns ------- SubstanceGeneGraph The graph has substrates/products as its nodes and genes as the connecting edges. Edges have a direction. Raises ------ HTTPError If any gene or pathway does not exist. URLError If connection to KEGG fails. """ file_name = 'SubstanceGeneGraph' folder_path = 'organism/' + self.nameAbbreviation + '/graph' if returnCacheEntry is False: # shall return result decorator = cache(folder_path = folder_path, file_name = file_name) else: # shall return CacheEntry object decorator = cacheEntry(folder_path = folder_path, file_name = file_name) function = lambda: Conversion.SubstanceReactionGraph2SubstanceGeneGraph(self.substanceReactionGraph()) return decorator(function)()
[docs] def substanceReactionGraph(self, returnCacheEntry = False) -> SubstanceReactionGraph: """ Substance-Reaction graph of this organism. Parameters ---------- returnCacheEntry : bool, optional If *True*, do not return the graph, but instead a :class:`FEV_KEGG.KEGG.File.CacheEntry`. This cache entry can be useful for parallel computation. Returns ------- SubstanceReactionGraph The graph has substrates/products as its nodes and reactions as the connecting edges. Edges have a direction. Raises ------ HTTPError If any gene or pathway does not exist. URLError If connection to KEGG fails. """ file_name = 'SubstanceReactionGraph' folder_path = 'organism/' + self.nameAbbreviation + '/graph' if returnCacheEntry is False: # shall return result decorator = cache(folder_path = folder_path, file_name = file_name) else: # shall return CacheEntry object decorator = cacheEntry(folder_path = folder_path, file_name = file_name) function = lambda: Conversion.KeggPathwaySet2SubstanceReactionGraph(self.getMetabolicPathways(), name = self.nameAbbreviation) return decorator(function)()
[docs]class Group(object): def __init__(self, organismAbbreviations: Iterable[str] = None, searchString: 'any part of organism description' = None, name = None, minimalSize = None): """ A Group of :class:`Organism`. If both parameters, `organismAbbreviations` and `searchString`, are specified, both lists will be appended, forming this group's list of organisms. If none of the parameters is specified, this Group has an empty organism list. Parameters ---------- organismAbbreviations : Iterable[str], optional Abbreviations of the desired organisms. If != *None*, tries to find one :class:`Organism` for each abbreviation in the list. searchString : str, optional Any part of the desired organisms' description. If != *None*, searches the list of all organisms known to KEGG for the passed string. An example entry of the KEGG list of organisms looks as follows: *"T00338 eci Escherichia coli O18:K1:H7 UTI89 (UPEC) Prokaryotes;Bacteria;Gammaproteobacteria - Enterobacteria;Escherichia"*. Any list entry matching the search string creates one :class:`Organism`, aggregated into this group. name : str, optional Custom name of this group. minimalSize : int, optional If not *None*, incorporate only organisms with EC graphs with at least `minimalSize` edges. Can be useful to filter incompletely annotated organisms. Attributes ---------- self.searchString : str self.name : str Raises ------ ValueError If any organism does not exist at all in KEGG. URLError If connection to KEGG fails. """ self._collectiveEcGraph = None self._collectiveEcGraph_noMultifunctional = None self._collectiveEnzymeGraph = None self._collectiveEnzymeGraph_noMultifunctional = None self.searchString = searchString self.__organisms = set() if searchString is not None: organismList = Database.getOrganismList() matchList = [] for entry in organismList: if searchString in entry: entrySplit = entry.split('\t') matchList.append(entrySplit[1]) organisms = Organism.__initBulk__(matchList) self.__organisms.update(organisms) if organismAbbreviations is not None: listObject = [] listObject.extend(organismAbbreviations) organisms = Organism.__initBulk__(listObject) self.__organisms.update(organisms) self.name = name if minimalSize is not None: # count sizes of each organisms graph organismsToRemove = set() for organism in organisms: ecGraph = organism.substanceEcGraph(noMultifunctional = False) if len(ecGraph.getEdges()) < minimalSize: organismsToRemove.add(organism) self.__organisms.difference_update( organismsToRemove ) def __str__(self): return 'Group(' + self.organisms + ')' def __eq__(self, other): if isinstance(self, other.__class__): return self.organisms == other.organisms return False def __ne__(self, other): return not self == other def __hash__(self): return self.organisms.__hash__()
[docs] def freeHeap(self): """ Free heap of memory-cached data. Removes objects kept on heap, i.e. which have a pointer kept in this object, because some function was called with `keepOnHeap` == *True*. Also calls garbage collector to break reference cycles in previously uncollected objects (generation 0). Note ---- Instead of using this, you will most likely want to never use any of the group methods with `keepOnHeap` == *True*. Then, calling this method would be unnecessary, as it would have no effect. """ self._collectiveEcGraph = None self._collectiveEcGraph_noMultifunctional = None self._collectiveEnzymeGraph = None self._collectiveEnzymeGraph_noMultifunctional = None gc.collect(0)
@property def organisms(self) -> Set[Organism]: """ Organisms of this group. Returns ------- Set[Organism] The set of organisms which are part of this group. Order is arbitrary. """ return self.__organisms @property def organismsCount(self) -> int: """ Number of organisms of this group. Returns ------- int The number of organisms in the set of organisms of this group. """ return len( self.__organisms )
[docs] def enzymeGraphs(self, noMultifunctional = settings.defaultNoMultifunctional) -> List[SubstanceEnzymeGraph]: """ All substance-enzyme graphs of this group. Parameters ---------- noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. Returns ------- List[SubstanceEnzymeGraph] Substance-enzyme graphs of all organisms in this group. Order is arbitrary. Raises ------ TypeError If you failed to enable :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or to provide a :attr:`FEV_KEGG.Util.Parallelism.processPool`. See :func:`_getGraphsParallelly`. HTTPError If fetching any of the underlying graphs fails. URLError If connection to KEGG fails. """ return self._getGraphsParallelly(self._enzymeGraphsWorker, self.organisms, noMultifunctional, 'enzyme graphs')
def _enzymeGraphsWorker(self, organism: Organism, noMultifunctional, returnCacheEntry) -> SubstanceEnzymeGraph: return organism.substanceEnzymeGraph(noMultifunctional, returnCacheEntry)
[docs] def ecGraphs(self, noMultifunctional = settings.defaultNoMultifunctional, minimalSize = None) -> List[SubstanceEcGraph]: """ All substance-EC graphs of this group. Parameters ---------- noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. minimalSize : int, optional If not *None*, return only EC graphs with at least `minimalSize` edges. Can be useful to filter incompletely annotated organisms. Returns ------- List[SubstanceEcGraph] Substance-EC graphs of all organisms in this group. Order is arbitrary. Raises ------ TypeError If you failed to enable :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or to provide a :attr:`FEV_KEGG.Util.Parallelism.processPool`. See :func:`_getGraphsParallelly`. HTTPError If fetching any of the underlying graphs fails. URLError If connection to KEGG fails. """ return self._getGraphsParallelly(self._ecGraphsWorker, self.organisms, noMultifunctional, 'EC graphs', minimalSize = minimalSize)
def _ecGraphsWorker(self, organism: Organism, noMultifunctional, returnCacheEntry) -> SubstanceEcGraph: return organism.substanceEcGraph(noMultifunctional, returnCacheEntry)
[docs] def _getGraphsParallelly(self, worker, organisms, noMultifunctional, debugText, minimalSize = None): """ Does the actual fetching and computing of the graphs in parallel. Raises ------ TypeError If you failed to enable :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or to provide a :attr:`FEV_KEGG.Util.Parallelism.processPool`. Warnings -------- If you did not enable parallel computation by either enabling :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or providing :attr:`FEV_KEGG.Util.Parallelism.processPool`, this will fail with a TypeError. Note ---- If you enabled :attr:`FEV_KEGG.settings.automaticallyStartProcessPool`, this will run parallelly in multiple processes with multiple threads in each, depending on your settings and computer. """ threadPool = concurrent.futures.ThreadPoolExecutor(Parallelism.getNumberOfThreadsFile()) futures = [] futuresIO = [] futuresGenerator = None resultFutures = None try: # submit work to process pool for organism in organisms: if Parallelism.processPool is None: raise TypeError("Process pool does not exist. Did you forget to FEV_KEGG.startProcessPool()?") futures.append( Parallelism.processPool.submit( worker, organism, noMultifunctional, True ) ) futuresGenerator = concurrent.futures.as_completed( futures ) # add progress bar if settings.verbosity >= 1: if settings.verbosity >= 2: print( 'Fetching ' + debugText + ' from ' + str(len(organisms)) + ' organisms...' ) futuresGenerator = tqdm.tqdm(futuresGenerator, total = len(organisms), unit = ' organisms', position = 0) # when any work item in process pool finishes for future in futuresGenerator: try: cacheEntry = future.result() except KeyboardInterrupt: raise except concurrent.futures.CancelledError: Parallelism.printBelowProgress( "Future cancelled. Continuing anyway..." ) continue except concurrent.futures.TimeoutError: Parallelism.printBelowProgress( "Future timed out. Continuing anyway..." ) continue except NoKnownPathwaysError as e: # organism has no known pathways, ignore it Parallelism.printBelowProgress( "Future raised error: " + str(e) + " Ignoring this organism." ) continue except Exception: # any non-exiting error Parallelism.printBelowProgress( "Future raised error, see stack trace above. Halting by KeyboardInterrupt..." ) raise KeyboardInterrupt() futuresIO.append( threadPool.submit(cacheEntry.getResult) ) resultFutures = concurrent.futures.as_completed( futuresIO ) if settings.verbosity >= 2: if settings.verbosity >= 2: print( 'Doing I/O for ' + str(len(organisms)) + ' organisms...' ) resultFutures = tqdm.tqdm(resultFutures, total = len(organisms), unit = ' organism I/Os', position = 0) graphs = [] for future in resultFutures: graph = future.result() if minimalSize is None or len(graph.getEdges()) >= minimalSize: graphs.append( graph ) return graphs except KeyboardInterrupt: # only raised in main thread (once in each process!) Parallelism.keyboardInterruptHandler(processPoolFutures=futures, threadPool=threadPool, threadPoolFutures=futuresIO, terminateProcess=True) raise except BaseException: if Parallelism.isMainThread(): Parallelism.keyboardInterruptHandler(processPoolFutures=futures, threadPool=threadPool, threadPoolFutures=futuresIO, silent=True) raise finally: if threadPool is not None: threadPool.shutdown(wait = False) if futuresGenerator is not None: futuresGenerator.close() if resultFutures is not None: resultFutures.close() Parallelism.printBelowProgress(None)
[docs] def collectiveEcGraph(self, noMultifunctional = settings.defaultNoMultifunctional, addCount = False, keepOnHeap = True, addEcDescriptions = False) -> SubstanceEcGraph: """ The collective of all SubstanceEcGraphs, by union operation, from all organisms in this group. Nodes of the same Substance are merged, all edges of differing ECs with a unique pair of nodes survive. Parameters ---------- noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. addCount : bool, optional If *True*, the returned graph contains extra dicts. 1 ``graph.nodeCounts[node]`` = number of organisms which contained this node. 2 ``graph.edgeCounts[(node, node, element)]`` = number of organisms which contained this edge. 3 ``graph.edgeElementCounts[element]`` = number of organisms which contained this element. Attention! These counter dictionaries are **NOT** updated if your add or remove a node/edge/element! keepOnHeap : bool, optional Keeps a common graph in memory to speed up subsequent calls of this or other methods. This can take up a lot of memory! Once this object is garbage collected, the common graph will be, too. Returns ------- SubstanceEcGraph The substance-EC graph composed of all this group's organism's substance-EC graphs. Raises ------ TypeError If you failed to enable :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or to provide a :attr:`FEV_KEGG.Util.Parallelism.processPool`. See :func:`_getGraphsParallelly`. HTTPError If fetching any of the underlying graphs fails. URLError If connection to KEGG fails. See Also -------- FEV_KEGG.Graph.Models.CommonGraphApi.union : Union operator. """ if keepOnHeap is True: # shall keep the result on heap # check first if it already is on heap if noMultifunctional is True: collectiveEcGraph = self._collectiveEcGraph_noMultifunctional else: collectiveEcGraph = self._collectiveEcGraph if collectiveEcGraph is not None: return collectiveEcGraph.copy() if settings.verbosity >= 1: print('calculating collective EC graph...') allSubstanceEcGraphs = self.ecGraphs(noMultifunctional) if isinstance(allSubstanceEcGraphs, Generator): lastGraph = allSubstanceEcGraphs.next() else: lastGraph = allSubstanceEcGraphs.pop() collectiveGraph = lastGraph.union(allSubstanceEcGraphs, addCount = addCount) collectiveGraph.removeIsolatedNodes() if self.name is None: collectiveGraph.name = 'Collective EC graph' else: collectiveGraph.name = 'Collective EC graph ' + self.name if addEcDescriptions is True: collectiveGraph.addEcDescriptions() if keepOnHeap is True: # shall keep the result on heap # save the calculated result on heap if noMultifunctional is True: self._collectiveEcGraph_noMultifunctional = collectiveGraph else: self._collectiveEcGraph = collectiveGraph return collectiveGraph.copy()
[docs] def consensusEcGraph(self, noMultifunctional = settings.defaultNoMultifunctional, keepOnHeap = True) -> SubstanceEcGraph: """ The consensus of all SubstanceEcGraphs, by intersection operation, from all organisms in this group. Afterwards, removes isolated nodes. Parameters ---------- noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. keepOnHeap : bool, optional Keeps a common graph in memory to speed up subsequent calls of this or other methods. This can take up a lot of memory! Once this object is garbage collected, the common graph will be, too. Returns ------- SubstanceEcGraph The substance-EC graph intersected of all this group's organism's substance-EC graphs. Raises ------ TypeError If you failed to enable :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or to provide a :attr:`FEV_KEGG.Util.Parallelism.processPool`. See :func:`_getGraphsParallelly`. HTTPError If fetching any of the underlying graphs fails. URLError If connection to KEGG fails. See Also -------- FEV_KEGG.Graph.Models.CommonGraphApi.intersection : Intersection operator. """ if keepOnHeap is True: collectiveEcGraph = self.collectiveEcGraph(noMultifunctional, addCount = True, keepOnHeap = True) edgeCounts = collectiveEcGraph.edgeCounts edgesToBeRemoved = [] for edge, count in edgeCounts.items(): if count < self.organismsCount: # has not occured in enough organisms edgesToBeRemoved.append(edge) collectiveEcGraph.removeEcEdges(edgesToBeRemoved) collectiveEcGraph.removeIsolatedNodes() if self.name is None: collectiveEcGraph.name = 'Consensus EC graph' else: collectiveEcGraph.name = 'Consensus EC graph ' + self.name return collectiveEcGraph else: allSubstanceEcGraphs = self.ecGraphs(noMultifunctional) if isinstance(allSubstanceEcGraphs, Generator): lastGraph = allSubstanceEcGraphs.next() else: lastGraph = allSubstanceEcGraphs.pop() consensusGraph = lastGraph.intersection(allSubstanceEcGraphs) consensusGraph.removeIsolatedNodes() if self.name is None: consensusGraph.name = 'Consensus EC graph' else: consensusGraph.name = 'Consensus EC graph ' + self.name return consensusGraph
[docs] def majorityEcGraph(self, majorityPercentage = 90, majorityTotal = None, noMultifunctional = settings.defaultNoMultifunctional, keepOnHeap = True) -> SubstanceEcGraph: """ The majority-consensus of all SubstanceEcGraphs, by majority-intersection operation, from all organisms in this group. If the majority of organisms contains an edge, it is added to the majority-consensus. Afterwards, removes isolated nodes. Parameters ---------- majorityPercentage : float, optional Majority percentage means 'at least x%' and is rounded up. For example 90% of 11 organisms would be ceiling(9,9) = 10 organisms. majorityTotal : int, optional If given (not *None*), `majorityPercentage` is ignored and the percentage of organisms for a majority is calculated from `majorityTotal` alone. noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. keepOnHeap : bool, optional Keeps a common graph in memory to speed up subsequent calls of this or other methods. This can take up a lot of memory! Once this object is garbage collected, the common graph will be, too. Returns ------- SubstanceEcGraph The substance-EC graph majority-intersected of all this group's organism's substance-EC graphs. Raises ------ TypeError If you failed to enable :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or to provide a :attr:`FEV_KEGG.Util.Parallelism.processPool`. See :func:`_getGraphsParallelly`. HTTPError If fetching any of the underlying graphs fails. URLError If connection to KEGG fails. See Also -------- FEV_KEGG.Graph.Models.CommonGraphApi.majorityIntersection : Majority intersection operator. """ if majorityTotal is not None: percentage = majorityTotal / self.organismsCount * 100 else: percentage = majorityPercentage if keepOnHeap is True: # check if majorityPercentage is sane if majorityPercentage <= 0 or majorityPercentage > 100: raise ValueError('Majority percentage is not a sane value (0 < percentage <= 100): ' + str(majorityPercentage)) majorityTotal = ceil((majorityPercentage/100) * self.organismsCount) collectiveEcGraph = self.collectiveEcGraph(noMultifunctional, addCount = True, keepOnHeap = True) edgeCounts = collectiveEcGraph.edgeCounts edgesToBeRemoved = [] for edge, count in edgeCounts.items(): if count < majorityTotal: # has not occured in enough organisms edgesToBeRemoved.append(edge) collectiveEcGraph.removeEcEdges(edgesToBeRemoved) collectiveEcGraph.removeIsolatedNodes() if self.name is None: collectiveEcGraph.name = 'Majority EC graph' else: collectiveEcGraph.name = 'Majority EC graph ' + self.name return collectiveEcGraph else: allSubstanceEcGraphs = self.ecGraphs(noMultifunctional) if isinstance(allSubstanceEcGraphs, Generator): lastGraph = allSubstanceEcGraphs.next() else: lastGraph = allSubstanceEcGraphs.pop() majorityGraph = lastGraph.majorityIntersection(allSubstanceEcGraphs, percentage) majorityGraph.removeIsolatedNodes() if self.name is None: majorityGraph.name = 'Majority EC graph' else: majorityGraph.name = 'Majority EC graph ' + self.name return majorityGraph
[docs] def collectiveEnzymeGraph(self, noMultifunctional = settings.defaultNoMultifunctional, keepOnHeap = True) -> SubstanceEnzymeGraph: """ The collective of all SubstanceEnzymeGraphs, by union operation, from all organisms in this group. Nodes of the same Substance are merged, all edges of differing Enzymes with a unique pair of nodes survive. Enzymes are compared by their GeneID and should, thus, all be different! Parameters ---------- noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. keepOnHeap : bool, optional Keeps a common graph in memory to speed up subsequent calls of this or other methods. This can take up a lot of memory! Once this object is garbage collected, the common graph will be, too. Returns ------- SubstanceEnzymeGraph The substance-enzyme graph composed of all this group's organism's substance-enzyme graphs. Raises ------ TypeError If you failed to enable :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or to provide a :attr:`FEV_KEGG.Util.Parallelism.processPool`. See :func:`_getGraphsParallelly`. HTTPError If fetching any of the underlying graphs fails. URLError If connection to KEGG fails. See Also -------- FEV_KEGG.Graph.Models.CommonGraphApi.union : Union operator. """ if keepOnHeap is True: # shall keep the result on heap # check first if it already is on heap if noMultifunctional is True: collectiveEnzymeGraph = self._collectiveEnzymeGraph_noMultifunctional else: collectiveEnzymeGraph = self._collectiveEnzymeGraph if collectiveEnzymeGraph is not None: return collectiveEnzymeGraph.copy() if settings.verbosity >= 1: print('calculating collective enzyme graph...') allSubstanceEnzymeGraphs = self.enzymeGraphs(noMultifunctional) if isinstance(allSubstanceEnzymeGraphs, Generator): lastGraph = allSubstanceEnzymeGraphs.next() else: lastGraph = allSubstanceEnzymeGraphs.pop() collectiveGraph = lastGraph.union(allSubstanceEnzymeGraphs) if self.name is None: collectiveGraph.name = 'Collective Enzyme graph' else: collectiveGraph.name = 'Collective Enzyme graph ' + self.name if keepOnHeap is True: # shall keep the result on heap # save the calculated result on heap if noMultifunctional is True: self._collectiveEnzymeGraph_noMultifunctional = collectiveGraph else: self._collectiveEnzymeGraph = collectiveGraph return collectiveGraph.copy()
[docs] def collectiveEnzymeGraphByEcConsensus(self, noMultifunctional = settings.defaultNoMultifunctional) -> SubstanceEnzymeGraph: """ The collective SubstanceEnzymGraph, but containing only Enzymes whose EC numbers occur in the consensus of all SubstanceEcGraphs. Parameters ---------- noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. Raises ------ TypeError If you failed to enable :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or to provide a :attr:`FEV_KEGG.Util.Parallelism.processPool`. See :func:`_getGraphsParallelly`. HTTPError If fetching any of the underlying graphs fails. URLError If connection to KEGG fails. """ return self.collectiveEnzymeGraphByEcMajority(majorityPercentage = 100, majorityTotal = None, noMultifunctional = noMultifunctional)
[docs] def collectiveEnzymeGraphByEcMajority(self, majorityPercentage = 80, majorityTotal = None, noMultifunctional = settings.defaultNoMultifunctional) -> SubstanceEnzymeGraph: """ The collective SubstanceEnzymGraph, but containing only Enzymes whose EC numbers occur in the majority of all SubstanceEcGraphs. Parameters ---------- majorityPercentage : float, optional Majority percentage means 'at least x%' and is rounded up. For example 90% of 11 organisms would be ceiling(9,9) = 10 organisms. majorityTotal : int, optional If given (not *None*), `majorityPercentage` is ignored and the percentage of organisms for a majority is calculated from `majorityTotal` alone. noMultifunctional : bool, optional If *True*, ignore enzymes with multiple EC numbers. Raises ------ TypeError If you failed to enable :attr:`FEV_KEGG.settings.automaticallyStartProcessPool` or to provide a :attr:`FEV_KEGG.Util.Parallelism.processPool`. See :func:`_getGraphsParallelly`. HTTPError If fetching any of the underlying graphs fails. URLError If connection to KEGG fails. """ if majorityPercentage >= 100: ecNumbers = self.consensusEcGraph(noMultifunctional).getECs() else: ecNumbers = self.majorityEcGraph(majorityPercentage, majorityTotal, noMultifunctional).getECs() collectiveEnzymeGraph = self.collectiveEnzymeGraph(noMultifunctional) collectiveEnzymeGraph.keepEnzymesByEC(ecNumbers) collectiveEnzymeGraph.name += ' by EC majority' return collectiveEnzymeGraph