Source code for FEV_KEGG.KEGG.Download

from builtins import int
import urllib

from FEV_KEGG.lib.Biopython.KEGG import REST
from bs4 import BeautifulSoup
from retrying import retry
import tqdm
from typing import List, Tuple

from FEV_KEGG.Util import Util, Parallelism
from FEV_KEGG.KEGG import SSDB
import FEV_KEGG.settings as settings
import concurrent.futures
import re


[docs]def is_not_404(exception):
    """
    Checks if `exception` is **not** an HTTP 404 error.
    
    This is useful, because for most types of downloads 404 means 'does not exist' and is in itself a valid reply. Therefore, retrying a download because it raised a 404 error is usually unnecessary.
    
    Parameters
    ----------
    exception : Exception
        Any error class instance. Must be of class :class:`urllib.error.HTTPError` to be recognised as a 404 error.
    
    Returns
    -------
    bool
        *False*, if `exception` is an HTTP 404 error. *True*, if it is anythign else.
    """
    return not ( isinstance(exception, urllib.error.HTTPError) and exception.code == 404 )


[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404)
def downloadPathwayList(organismString: 'eco') -> str:
    """
    Downloads list of all pathways for a given organism from KEGG.
    
    Tries several times before giving up, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`.
    
    Parameters
    ----------
    organismString : str
        Abbreviation of the organism, e.g. 'eco'.
    
    Returns
    -------
    str
        List of pathways, delimited by '\\\\n'.
    
    Raises
    ------
    HTTPError
        If pathway list does not exist.
    URLError
        If connection to KEGG fails.
    """
    return REST.kegg_list('pathway', organismString, timeout=settings.downloadTimeoutSocket).read()


[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadPathway(organismString: 'eco', pathwayName: '00260') -> str:
    """
    Downloads pathway as KGML for a given organism from KEGG.
    
    Tries several times before giving up, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`.
    
    Parameters
    ----------
    organismString : str
        Abbreviation of the organism, e.g. 'eco'.
    pathwayName : str
        Name of the pathway, e.g. '00260'. Will be automatically concatenated with `organismString` to form the pathway ID, e.g. 'eco:00260'.
    
    Returns
    -------
    str
        Pathway in KGML format.
    
    Raises
    ------
    HTTPError
        If pathway does not exist.
    URLError
        If connection to KEGG fails.
    """
    return REST.kegg_get(organismString + pathwayName, 'kgml', timeout=settings.downloadTimeoutSocket).read()
    

[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax)
def downloadGene(geneID: 'eco:b0004') -> str:
    """
    Downloads gene description for a given gene ID (includes organism) from KEGG.
    
    Tries several times before giving up, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`.
    
    Parameters
    ----------
    geneID : str
        ID of the gene, including organism abbreviation, e.g. 'eco:b0004'.
    
    Returns
    -------
    str
        Gene in KEGG GENE format.
    
    Raises
    ------
    HTTPError
        If gene does not exist.
    URLError
        If connection to KEGG fails.
    """
    result = REST.kegg_get(geneID, timeout=settings.downloadTimeoutSocket).read()
    if len( result ) < 3:
        raise urllib.error.HTTPError( "Download too small:\n" + result)
    else:
        return result

[docs]def downloadGeneBulk(geneIDs: '[eco:b0004, eco:b0015,...]') -> str:
    """
    Downloads gene descriptions for a given list of gene IDs (includes organism) from KEGG
    
    Tries several times before giving up, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`.
    
    Parameters
    ----------
    geneIDs : Iterable
        IDs of the genes, including organism abbreviation, e.g. '[eco:b0004, eco:b0015,...]'.
    
    Returns
    -------
    str
        Genes in KEGG GENE format, delimited by a line of '///'. You will have to split them! Order is arbitrary.
    
    Raises
    ------
    IOError
        If result is too small. Possibly because none of the genes exist.
    URLError
        If connection to KEGG fails.
    """    
    max_query_count = 10 # hard limit imposed by KEGG server
    
    # split list of GeneIDs into chunks of size max_query_count
    geneIDs_chunks = Util.chunks(geneIDs, max_query_count)
    
    # form sub-queries
    query_parts = []
    for chunk in geneIDs_chunks:
        query_parts.append( '+'.join(chunk) )
    
    tqdmPosition = Parallelism.getTqdmPosition()
    threadPool = concurrent.futures.ThreadPoolExecutor(Parallelism.getNumberOfThreadsDownload())
    futures = []
    iterator = None
    
    try:
        # query KEGG in parallel
        
        for query_part in query_parts:
            futures.append( threadPool.submit(_downloadGeneBulk, query_part) )
        
        iterator = concurrent.futures.as_completed(futures)
        
        if settings.verbosity >= 1:
            if settings.verbosity >= 2:
                print( 'Downloading ' + str(len(geneIDs)) + ' genes, max. ' + str(max_query_count) + ' per chunk...' )
            iterator = tqdm.tqdm(iterator, total = len(query_parts), unit = ' *10 genes', position = tqdmPosition)
        
        result = ''
        for future in iterator:
            try:
                result_part = future.result()
            except concurrent.futures.CancelledError:
                continue
            result += result_part
            
        threadPool.shutdown(wait = False)
    
    except KeyboardInterrupt: # only raised in main thread (once in each process!)
        
        Parallelism.keyboardInterruptHandler(threadPool=threadPool, threadPoolFutures=futures, terminateProcess=True)
        raise
    
    except BaseException:
        
        if Parallelism.isMainThread():
            Parallelism.keyboardInterruptHandler(threadPool=threadPool, threadPoolFutures=futures, silent=True)
        raise
        
    finally:
        
        if iterator is not None: iterator.close()
    
    return result

@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax)
def _downloadGeneBulk(query_part):
    if Parallelism.getShallCancelThreads() is True:
        raise concurrent.futures.CancelledError()
    else:
        result = REST.kegg_get(query_part, timeout=settings.downloadTimeoutSocket).read()
        if len( result ) < 3:
            raise IOError( "Download too small:\n" + result)
        else:
            return result


[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax)
def downloadOrganismList() -> str:
    """
    Download the list of all organisms known to KEGG.
    
    Tries several times before giving up, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`.
    
    Returns
    -------
    str
        List of organism descriptions known to KEGG, delimited by '\\\\n'.
    
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    return REST.kegg_list('organism', timeout=settings.downloadTimeoutSocket).read()


[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadEnzymeEcNumbers(enzymeAbbreviation) -> str:
    """
    Download the list of all EC numbers for a given enzyme, identified by its abbreviation, from KEGG.
    
    Also works for everything else in the description of an enzyme, not just the abbreviation.
    Tries several times before giving up, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`.
    
    Parameters
    ----------
    enzymeAbbreviation : str
        Common abbreviation of the desired enzyme, as it appears in its description, e.g. 'MiA'. Also works for everything else in the description of an enzyme, not just the abbreviation.
    
    Returns
    -------
    str
        EC numbers, delimited by '\\\\n'.
    
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    ecNumbers = []
    
    # look up enzyme EC numbers
    searchResult = REST.kegg_find('enzyme', enzymeAbbreviation, timeout=settings.downloadTimeoutSocket).read().split('\n')
    for line in searchResult:
        
        if len( line ) < 10:
            continue
        
        ecNumber = line.split('\t')[0].split(':')[1]
        ecNumbers.append(ecNumber)
    
    return '\n'.join(ecNumbers)


[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadOrganismInfo(organismAbbreviation) -> str:
    """
    Downloads the info file of an organism.
    
    Parameters
    ----------
    organismAbbreviation : str
        Abbreviation of the organism to check, e.g. 'eco'.
    
    Returns
    -------
    str
        Raw organism info. *None*, if download was empty (400 Bad Request), because this organism does not exist.
        
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    try:
        return REST.kegg_info(organismAbbreviation, timeout=settings.downloadTimeoutSocket).read()
    except urllib.error.HTTPError as e:
        if isinstance(e, urllib.error.HTTPError) and e.code == 400:
            return None
        else:
            raise




[docs]def downloadOrthologs(geneID: 'GeneID', comparisonOrganismString: 'eco') -> Tuple[int, List[SSDB.PreMatch]]:
    """
    Download orthologs of gene `geneID` found in organism `comparisonOrganismString`.
    
    Parameters
    ----------
    geneID : GeneID
        GeneID object of the gene to be compared against, i.e. against its amino acid sequence.
    comparisonOrganismString : str
        Abbreviation of the organism to search for orthologs of `geneID`.
    
    Returns
    -------
    List[SSDB.PreMatch]
        List of Pre-Matches, containing gene IDs of orthologs, and other data necessary for sequence matching. Will be empty, if nothing is found.
    
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    # download list of orthologs
    data = _downloadHomologs(geneID.geneIDString, comparisonOrganismString)
    
    # parse HTML
    orthologData = _parseSsdbOrthologView(data)
    if orthologData is None:
        print( 'Failed to get SSDB data for ' + str(geneID) + ' in ' + comparisonOrganismString)
        return None
    
    foundGenes = orthologData
    
    return foundGenes
    

[docs]def downloadParalogs(geneID: 'GeneID') -> Tuple[int, List[SSDB.PreMatch]]:
    """
    Download paralogs of gene `geneID`.
    
    Parameters
    ----------
    geneID : GeneID
        GeneID object of the gene to be compared against, i.e. against its amino acid sequence.
    
    Returns
    -------
    List[SSDB.PreMatch]
        List of Pre-Matches, containing gene IDs of paralogs, and other data necessary for sequence matching. Will be empty, if nothing is found.
    
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    # download list of paralogs
    data = _downloadHomologs(geneID.geneIDString, geneID.organismAbbreviation)
    
    # parse HTML
    orthologData = _parseSsdbOrthologView(data)
    if orthologData is None:
        print( 'Failed to get SSDB data for ' + str(geneID))
        return None
    
    searchedSequenceLength, foundGenes = orthologData 
    
    # remove the gene that was searched for
    for preMatch in foundGenes:
        if preMatch.foundGeneIdString == geneID.geneIDString:
            foundGenes.remove(preMatch)
            break
    
    return (searchedSequenceLength, foundGenes)

@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def _downloadHomologs(geneIdString, organismAbbreviationString):
    return str(urllib.request.urlopen('https://www.kegg.jp/ssdb-bin/ssdb_ortholog_view?org_gene=' + geneIdString + '&org=' + organismAbbreviationString, timeout=settings.downloadTimeoutSocket).read()).replace('\\n', '')

AA_SEQ_LENGTH_REGEX_PATTERN = re.compile('\(([0-9]+) a\.a\.\)')
NT_SEQ_LENGTH_REGEX_PATTERN = re.compile('\(([0-9]+) n\.t\.\)') # length in AA == length in NT / 3 - 1

def _parseSsdbOrthologView(htmlString) -> Tuple[int, List[SSDB.PreMatch]]:
    
    try:
        html = BeautifulSoup(htmlString, 'html.parser')
        searchedSequenceLengthMatch = AA_SEQ_LENGTH_REGEX_PATTERN.search(html.body.a.next_sibling)
        if searchedSequenceLengthMatch is None: # length in amino acids not found, maybe it is given in nucleic acids
            searchedSequenceLengthMatch = NT_SEQ_LENGTH_REGEX_PATTERN.search(html.body.a.next_sibling)
        searchedSequenceLength = int(searchedSequenceLengthMatch.group(1))
        
        matches = []
        
        for index, tr in enumerate( html.table.children ):
            
            # ignore head of table
            if index == 0:
                continue
            
            for index2, td in enumerate( tr.children ):
                 
                if index2 == 0: # read gene ID
                    foundGeneIdString = td.text
                     
                elif index2 == 1: # read Smith-Waterman score
                    swScore = int(td.text)
                     
                elif index2 == 2: # read bit score
                    bitScore = float(td.text)
                     
                elif index2 == 3: # read identity
                    identity = float(td.text)
                     
                elif index2 == 4: # read overlap
                    overlap = int(td.text)
            
            matches.append( SSDB.PreMatch(foundGeneIdString, swScore, bitScore, identity, overlap) )
        
        return (searchedSequenceLength, matches)
    
    except BaseException as _:
        return None




[docs]def downloadOrthologOverview(geneID: 'GeneID') -> Tuple[int, List[SSDB.Match]]:
    """
    Download overview of orthologs of gene `geneID` found in any organism.
    
    Parameters
    ----------
    geneID : GeneID
        GeneID object of the gene to be compared against, i.e. against its amino acid sequence.
    
    Returns
    -------
    List[SSDB.Match]
        List of Matches, containing gene IDs of orthologs, and other data necessary for sequence matching. Will be empty, if nothing is found.
    
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    # download list of orthologs
    data = _downloadOrthologOverview(geneID.geneIDString)
    
    # parse HTML
    try:
        foundGenes = _parseSsdbBestView(data)
    except (IndexError, ValueError, AttributeError):
        print("\nError when parsing ortholog overview of gene: " + str(geneID))
        return None
    
    return foundGenes

@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def _downloadOrthologOverview(geneIdString):
    return str(urllib.request.urlopen('https://www.kegg.jp/ssdb-bin/ssdb_best_best?threshold=400&org_gene=' + geneIdString, timeout=settings.downloadTimeoutSocket).read()).replace('\\n', '')

SSDB_OVERVIEW_REGEX = re.compile("\)\s*|\s*[\(]{0,1}\s*")

def _parseSsdbBestView(htmlString) -> Tuple[int, List[SSDB.Match]]:
    
    html = BeautifulSoup(htmlString.replace('&#', ''), 'html.parser')
    searchedSequenceLengthMatch = AA_SEQ_LENGTH_REGEX_PATTERN.search(html.table.tr.text)
    if searchedSequenceLengthMatch is None: # length in amino acids not found, maybe it is given in nucleic acids
        searchedSequenceLengthMatch = NT_SEQ_LENGTH_REGEX_PATTERN.search(html.table.tr.text)
    searchedSequenceLength = int(searchedSequenceLengthMatch.group(1))
    
    matches = []
    
    content = str(html.pre).split('<input')
    content.pop(0) # ignore first line, containing table heading
    
    for line in content:
        line = '<input' + line
        #print(line)
        lineHtml = BeautifulSoup(line, 'html.parser')
        #print(lineHtml)
        foundGeneIdString = lineHtml.input['value']
        
#         textFields = lineHtml.text.split()#.replace('&', '').replace('#', '')
        textFields = SSDB_OVERVIEW_REGEX.split(lineHtml.text)
#         print(textFields)
        
        try:
            length = int(textFields[-8])
            swScore = int(textFields[-7])
            bitScore = float(textFields[-5])
            identity = float(textFields[-4])
            overlap= int(textFields[-3])
        except (IndexError, ValueError):
            print(line)
            raise
        
        try:
            match = SSDB.Match(foundGeneIdString, swScore, bitScore, identity, overlap, length)
            matches.append( match )
        except ValueError: # ignore GeneID which is not correctly formatted. Can happen when it is an incomplete "Addendum" organism, or a virus, etc.
            continue        
    
    return (searchedSequenceLength, matches)
    






[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax)
def downloadTaxonomyNCBI() -> str:
    """
    Download NCBI taxonomy from KEGG BRITE.
    
    Returns
    -------
    str
        NCBI taxonomy in special text format.
    
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    return REST.kegg_get('br:br08610', timeout=settings.downloadTimeoutSocket).read()


[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax)
def downloadTaxonomyKEGG():
    """
    Download KEGG taxonomy from KEGG BRITE.
    
    Returns
    -------
    str
        KEGG taxonomy in special text format.
    
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    return REST.kegg_get('br:br08601', timeout=settings.downloadTimeoutSocket).read()



[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadSubstance(substanceID):
    """
    Download a substance description file from KEGG, compound or glycan.
    
    Parameters
    ----------
    substanceID : str
        The ID string of the substance to download, e.g. 'C00084'.
    
    Returns
    -------
    str
        Content of the substance's description.
    
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    return REST.kegg_get(substanceID, timeout=settings.downloadTimeoutSocket).read()


[docs]@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadEcEnzyme(ecNumberID):
    """
    Download an enzyme description file from KEGG, defined by its EC number.
    
    Parameters
    ----------
    ecNumber : str
        The EC number string of the enzyme to download, e.g. '4.1.2.48'.
    
    Returns
    -------
    str
        Content of the EcEnzymes's description.
    
    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    return REST.kegg_get('ec:' + ecNumberID, timeout=settings.downloadTimeoutSocket).read()