# Source code for FEV_KEGG.Evolution.Comparison

from FEV_KEGG.Graph.Elements import Enzyme, GeneID
from typing import Set, Iterable, Dict
from FEV_KEGG.KEGG.Database import getOrthologsBulk
from FEV_KEGG import settings

def getOrthologsWithinGeneIDs(geneIDs: Iterable[GeneID], eValue : float = settings.defaultEvalue) -> Dict[GeneID, Set[GeneID]]:
    """
    Get orthologs within `geneIDs`.

    The gene IDs are grouped by their organism. Then, for each organism in order of first occurrence in `geneIDs`,
    all of its genes are searched for orthologs in every organism that comes later in that order.
    Found genes which are not part of `geneIDs` are discarded.

    Parameters
    ----------
    geneIDs : Iterable[GeneID]
        Gene IDs among which to search for orthology. May be any iterable, including a one-shot iterator; it is consumed exactly once.
    eValue : float, optional
        Statistical expectation value (E-value), below which a sequence alignment is considered significant.

    Returns
    -------
    Dict[GeneID, Set[GeneID]]
        Dictionary of every searched gene in `geneIDs` for which at least one ortholog in `geneIDs` was found, pointing to a set of all its found orthologs.
        Genes without any such ortholog are not contained as keys. An empty dictionary is returned if `geneIDs` is empty or covers only a single organism.

    Raises
    ------
    ValueError
        If any organism does not exist.
    URLError
        If connection to KEGG fails.

    Warnings
    --------
    This operation has a worst-case complexity class of O(n(n+1)/2)! To make matters worse, each step involves at least one, if not several, network operations with KEGG SSDB/GENE, which are inherently slow.
    The best-case complexity class is O(n-1).

    Notes
    -----
    Each pair of organisms is searched only once, from the organism occurring earlier in `geneIDs` towards the one occurring later.
    Only the searched gene becomes a key of the result, the reverse link (found gene -> searched gene) is not added.
    NOTE(review): confirm whether callers expect a symmetric result.
    """
    # Consume `geneIDs` exactly once: group genes by organism and, in the same pass, remember all of them for filtering matches later.
    # Iterating `geneIDs` again afterwards would yield nothing for a one-shot iterable (e.g. a generator).
    allGeneIDs = set()
    genesByOrganism = dict()
    for geneID in geneIDs:
        allGeneIDs.add(geneID)
        genesByOrganism.setdefault(geneID.organismAbbreviation, set()).add(geneID)

    orthologousGenes = dict()

    # search orthologs, for each organism's genes in every organism not yet used as search origin
    organisms = list(genesByOrganism)
    for index, organismAbbreviation in enumerate(organisms[:-1]): # the last organism has no organisms left to search in
        remainingOrganisms = organisms[index + 1:]

        # get orthologs for all of current organism's genes, in all remaining organisms at once
        currentOrganism_Genes = genesByOrganism[organismAbbreviation]
        matchingsDict = getOrthologsBulk(currentOrganism_Genes, remainingOrganisms, eValue)

        # link found orthologous genes to searched genes
        for searchedGeneID, matchingList in matchingsDict.items():
            for matching in matchingList:
                # leave only genes we actually search for
                matchedGeneIDs = {match.foundGeneID for match in matching.matches} & allGeneIDs

                if matchedGeneIDs:
                    orthologousGenes.setdefault(searchedGeneID, set()).update(matchedGeneIDs)

    return orthologousGenes

def getOrthologsWithinEnzymes(enzymes: Iterable[Enzyme], eValue : float = settings.defaultEvalue) -> Dict[Enzyme, Set[Enzyme]]:
    """
    Get orthologs within `enzymes`.

    The enzymes are translated to their gene IDs, orthology is searched among these gene IDs via :func:`getOrthologsWithinGeneIDs`, and the result is translated back to enzymes.

    Parameters
    ----------
    enzymes : Iterable[Enzyme]
        Enzymes among which to search for orthology. May be any iterable, including a one-shot iterator; it is consumed exactly once.
        If several enzymes share the same gene ID, only the last one of them is used for translating results back.
    eValue : float, optional
        Statistical expectation value (E-value), below which a sequence alignment is considered significant.

    Returns
    -------
    Dict[Enzyme, Set[Enzyme]]
        Dictionary of every enzyme in `enzymes` for which the gene ID search reported at least one ortholog in `enzymes`, pointing to a set of all its reported orthologs.

    Raises
    ------
    ImpossiblyOrthologousError
        NOTE(review): the previous documentation listed this error with reference to parameters (`geneIDs`, `comparisonOrganism`) which this function does not have.
        Presumably it could only be propagated from the underlying search, if at all - confirm.
    ValueError
        If any organism does not exist.
    URLError
        If connection to KEGG fails.

    Warnings
    --------
    This operation has a worst-case complexity class of O(n(n+1)/2)! To make matters worse, each step involves at least one, if not several, network operations with KEGG SSDB/GENE, which are inherently slow.
    The best-case complexity class is still O(n-1).
    """
    # create reverse mapping for GeneID -> Enzyme; this is the only pass over `enzymes`, so a one-shot iterable works, too
    gene2Enzyme = {enzyme.geneID: enzyme for enzyme in enzymes}

    # the keys are exactly the gene IDs of all enzymes, in order of first occurrence
    orthologousGenesDict = getOrthologsWithinGeneIDs(list(gene2Enzyme), eValue)

    # reverse-map found GeneID -> Enzyme
    orthologousEnzymes = dict()
    for searchedGeneID, orthologousGeneIDs in orthologousGenesDict.items():
        orthologousEnzymes[gene2Enzyme[searchedGeneID]] = {gene2Enzyme[orthologousGeneID] for orthologousGeneID in orthologousGeneIDs}

    return orthologousEnzymes