from builtins import int
import urllib
from FEV_KEGG.lib.Biopython.KEGG import REST
from bs4 import BeautifulSoup
from retrying import retry
import tqdm
from typing import List, Tuple
from FEV_KEGG.Util import Util, Parallelism
from FEV_KEGG.KEGG import SSDB
import FEV_KEGG.settings as settings
import concurrent.futures
import re
def is_not_404(exception):
    """
    Tell whether `exception` is anything other than an HTTP 404 error.

    For most downloads a 404 simply means 'does not exist', which is a valid answer in itself, so retrying such a download is usually pointless. This predicate is meant to be passed as `retry_on_exception` to the retry decorator.

    Parameters
    ----------
    exception : Exception
        Any error class instance. Only instances of :class:`urllib.error.HTTPError` can be recognised as a 404 error.

    Returns
    -------
    bool
        *False*, if `exception` is an HTTP 404 error. *True*, if it is anything else.
    """
    # anything that is not an HTTPError can not be a 404
    if not isinstance(exception, urllib.error.HTTPError):
        return True
    return exception.code != 404
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404)
def downloadPathwayList(organismString: 'eco') -> str:
    """
    Fetch the list of all pathways of one organism from KEGG.

    The download is retried with exponential backoff, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`. HTTP 404 errors are not retried, they are raised immediately.

    Parameters
    ----------
    organismString : str
        Abbreviation of the organism, e.g. 'eco'.

    Returns
    -------
    str
        List of pathways, one pathway per line.

    Raises
    ------
    HTTPError
        If pathway list does not exist.
    URLError
        If connection to KEGG fails.
    """
    response = REST.kegg_list('pathway', organismString, timeout=settings.downloadTimeoutSocket)
    return response.read()
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadPathway(organismString: 'eco', pathwayName: '00260') -> str:
    """
    Fetch a single pathway of one organism from KEGG, in KGML format.

    The download is retried with exponential backoff, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`. HTTP 404 errors are not retried, they are raised immediately.

    Parameters
    ----------
    organismString : str
        Abbreviation of the organism, e.g. 'eco'.
    pathwayName : str
        Name of the pathway, e.g. '00260'. It is appended directly to `organismString` to form the pathway ID, e.g. 'eco00260'.

    Returns
    -------
    str
        Pathway in KGML format.

    Raises
    ------
    HTTPError
        If pathway does not exist.
    URLError
        If connection to KEGG fails.
    """
    pathwayID = organismString + pathwayName
    response = REST.kegg_get(pathwayID, 'kgml', timeout=settings.downloadTimeoutSocket)
    return response.read()
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax)
def downloadGene(geneID: 'eco:b0004') -> str:
    """
    Downloads gene description for a given gene ID (includes organism) from KEGG.

    Tries several times before giving up, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`. Every kind of error is retried, including the 'download too small' error raised here.

    Parameters
    ----------
    geneID : str
        ID of the gene, including organism abbreviation, e.g. 'eco:b0004'.

    Returns
    -------
    str
        Gene in KEGG GENE format.

    Raises
    ------
    HTTPError
        If gene does not exist, or if the downloaded description is shorter than three characters.
    URLError
        If connection to KEGG fails.
    """
    result = REST.kegg_get(geneID, timeout=settings.downloadTimeoutSocket).read()
    if len(result) < 3:
        # HTTPError requires all five constructor arguments (url, code, msg, hdrs, fp);
        # passing only a message raises TypeError instead of the documented HTTPError.
        # NOTE(review): code 404 is chosen because the documented contract is
        # "HTTPError if gene does not exist" -- confirm this is the intended status.
        raise urllib.error.HTTPError(None, 404, "Download too small:\n" + result, None, None)
    return result
def downloadGeneBulk(geneIDs: '[eco:b0004, eco:b0015,...]') -> str:
    """
    Downloads gene descriptions for a given list of gene IDs (includes organism) from KEGG.

    The IDs are split into chunks, each chunk is downloaded by a worker thread, and the partial results are concatenated in the order in which the downloads finish.
    Each chunk is tried several times before giving up, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`.

    Parameters
    ----------
    geneIDs : Iterable
        IDs of the genes, including organism abbreviation, e.g. '[eco:b0004, eco:b0015,...]'.

    Returns
    -------
    str
        Genes in KEGG GENE format, delimited by a line of '///'. You will have to split them! Order is arbitrary.

    Raises
    ------
    IOError
        If result is too small. Possibly because none of the genes exist.
    URLError
        If connection to KEGG fails.
    """
    max_query_count = 10 # hard limit imposed by KEGG server

    # one '+'-joined sub-query per chunk of at most max_query_count gene IDs
    query_parts = ['+'.join(chunk) for chunk in Util.chunks(geneIDs, max_query_count)]

    tqdmPosition = Parallelism.getTqdmPosition()
    threadPool = concurrent.futures.ThreadPoolExecutor(Parallelism.getNumberOfThreadsDownload())
    futures = []
    iterator = None

    try:
        # query KEGG in parallel; futures are appended one by one, so that the
        # handlers below always see every future submitted so far
        for query_part in query_parts:
            futures.append(threadPool.submit(_downloadGeneBulk, query_part))

        iterator = concurrent.futures.as_completed(futures)

        if settings.verbosity >= 1:
            if settings.verbosity >= 2:
                print( 'Downloading ' + str(len(geneIDs)) + ' genes, max. ' + str(max_query_count) + ' per chunk...' )
            # wrap the iterator in a progress bar
            iterator = tqdm.tqdm(iterator, total = len(query_parts), unit = ' *10 genes', position = tqdmPosition)

        result = ''
        for finished in iterator:
            try:
                result += finished.result()
            except concurrent.futures.CancelledError:
                # a cancelled chunk contributes nothing to the result
                continue

        threadPool.shutdown(wait = False)

    except KeyboardInterrupt: # only raised in main thread (once in each process!)
        Parallelism.keyboardInterruptHandler(threadPool=threadPool, threadPoolFutures=futures, terminateProcess=True)
        raise

    except BaseException:
        if Parallelism.isMainThread():
            Parallelism.keyboardInterruptHandler(threadPool=threadPool, threadPoolFutures=futures, silent=True)
        raise

    finally:
        if iterator is not None:
            iterator.close()

    return result
def _is_not_cancelled(exception):
    """Retry predicate: *False* for a :class:`concurrent.futures.CancelledError`, *True* for every other exception."""
    return not isinstance(exception, concurrent.futures.CancelledError)


@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=_is_not_cancelled) # a cancellation must end the download immediately instead of being retried
def _downloadGeneBulk(query_part):
    """
    Download one chunk of gene descriptions from KEGG.

    Tries several times before giving up, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`. The cancel flag is checked at the start of every attempt, and a cancellation is never retried.

    Parameters
    ----------
    query_part : str
        Query string passed unchanged to the KEGG 'get' operation.

    Returns
    -------
    str
        The downloaded text.

    Raises
    ------
    concurrent.futures.CancelledError
        If :func:`Parallelism.getShallCancelThreads` reports that threads shall be cancelled.
    IOError
        If the download is shorter than three characters.
    """
    if Parallelism.getShallCancelThreads() is True:
        raise concurrent.futures.CancelledError()

    result = REST.kegg_get(query_part, timeout=settings.downloadTimeoutSocket).read()
    if len(result) < 3:
        raise IOError("Download too small:\n" + result)
    return result
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax)
def downloadOrganismList() -> str:
    """
    Fetch the list of all organisms known to KEGG.

    The download is retried with exponential backoff, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`.

    Returns
    -------
    str
        List of organism descriptions known to KEGG, one organism per line.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    response = REST.kegg_list('organism', timeout=settings.downloadTimeoutSocket)
    return response.read()
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadEnzymeEcNumbers(enzymeAbbreviation) -> str:
    """
    Look up the EC numbers of all enzymes whose description matches a search term.

    The term is usually the enzyme's abbreviation, but anything else appearing in an enzyme's description works as well.
    The download is retried with exponential backoff, see :attr:`FEV_KEGG.settings.retryDownloadBackoffFactor`. HTTP 404 errors are not retried.

    Parameters
    ----------
    enzymeAbbreviation : str
        Common abbreviation of the desired enzyme, as it appears in its description, e.g. 'MiA'.

    Returns
    -------
    str
        EC numbers, one per line.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    searchText = REST.kegg_find('enzyme', enzymeAbbreviation, timeout=settings.downloadTimeoutSocket).read()

    # each usable line is '<prefix>:<EC number><TAB><description>'; lines shorter than 10 characters are skipped
    ecNumbers = [
        line.split('\t')[0].split(':')[1]
        for line in searchText.split('\n')
        if len(line) >= 10
    ]
    return '\n'.join(ecNumbers)
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadOrganismInfo(organismAbbreviation) -> str:
    """
    Fetch the info file of an organism from KEGG.

    The download is retried with exponential backoff; HTTP 404 errors are not retried.

    Parameters
    ----------
    organismAbbreviation : str
        Abbreviation of the organism to check, e.g. 'eco'.

    Returns
    -------
    str
        Raw organism info. *None*, if KEGG answered with 400 Bad Request, because this organism does not exist.

    Raises
    ------
    HTTPError
        For any HTTP error other than 400.
    URLError
        If connection to KEGG fails.
    """
    try:
        response = REST.kegg_info(organismAbbreviation, timeout=settings.downloadTimeoutSocket)
        return response.read()
    except urllib.error.HTTPError as httpError:
        # only a 400 is translated into 'organism does not exist'
        if httpError.code != 400:
            raise
        return None
def downloadOrthologs(geneID: 'GeneID', comparisonOrganismString: 'eco') -> Tuple[int, List[SSDB.PreMatch]]:
    """
    Fetch the orthologs of gene `geneID` found in organism `comparisonOrganismString`.

    Parameters
    ----------
    geneID : GeneID
        GeneID object of the gene to be compared against, i.e. against its amino acid sequence.
    comparisonOrganismString : str
        Abbreviation of the organism to search for orthologs of `geneID`.

    Returns
    -------
    Tuple[int, List[SSDB.PreMatch]]
        Length of the searched sequence, and the list of Pre-Matches, containing gene IDs of orthologs and other data necessary for sequence matching.
        *None*, if the SSDB page could not be parsed; a message is printed in that case.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    html = _downloadHomologs(geneID.geneIDString, comparisonOrganismString)
    parsed = _parseSsdbOrthologView(html)

    if parsed is not None:
        return parsed

    print( 'Failed to get SSDB data for ' + str(geneID) + ' in ' + comparisonOrganismString)
    return None
def downloadParalogs(geneID: 'GeneID') -> Tuple[int, List[SSDB.PreMatch]]:
    """
    Fetch the paralogs of gene `geneID`, i.e. its homologs within its own organism.

    Parameters
    ----------
    geneID : GeneID
        GeneID object of the gene to be compared against, i.e. against its amino acid sequence.

    Returns
    -------
    Tuple[int, List[SSDB.PreMatch]]
        Length of the searched sequence, and the list of Pre-Matches, containing gene IDs of paralogs and other data necessary for sequence matching. The searched gene itself is removed from the list.
        *None*, if the SSDB page could not be parsed; a message is printed in that case.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    html = _downloadHomologs(geneID.geneIDString, geneID.organismAbbreviation)
    parsed = _parseSsdbOrthologView(html)

    if parsed is None:
        print( 'Failed to get SSDB data for ' + str(geneID))
        return None

    searchedSequenceLength, foundGenes = parsed

    # the searched gene is listed as its own homolog; drop its first occurrence
    ownEntry = next((candidate for candidate in foundGenes if candidate.foundGeneIdString == geneID.geneIDString), None)
    if ownEntry is not None:
        foundGenes.remove(ownEntry)

    return (searchedSequenceLength, foundGenes)
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def _downloadHomologs(geneIdString, organismAbbreviationString):
    """
    Download the SSDB ortholog view page of a gene, restricted to one organism.

    Parameters
    ----------
    geneIdString : str
        Gene ID, inserted unescaped as the `org_gene` URL parameter.
    organismAbbreviationString : str
        Organism abbreviation, inserted unescaped as the `org` URL parameter.

    Returns
    -------
    str
        The string representation of the downloaded bytes, with every literal backslash-n sequence removed.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    url = 'https://www.kegg.jp/ssdb-bin/ssdb_ortholog_view?org_gene=' + geneIdString + '&org=' + organismAbbreviationString
    # the context manager closes the HTTP response even if read() fails
    with urllib.request.urlopen(url, timeout=settings.downloadTimeoutSocket) as response:
        rawContent = response.read()
    # str() of the raw content is kept on purpose: line breaks then show up as the
    # two characters backslash + 'n', which is exactly what gets stripped here
    return str(rawContent).replace('\\n', '')
# The sequence length is given in parentheses, either in amino acids, e.g. '(820 a.a.)',
# or in nucleotides, e.g. '(2463 n.t.)'. Group 1 captures the number.
# Raw strings are used, because '\(' and '\.' are invalid escape sequences in ordinary string literals.
AA_SEQ_LENGTH_REGEX_PATTERN = re.compile(r'\(([0-9]+) a\.a\.\)')
NT_SEQ_LENGTH_REGEX_PATTERN = re.compile(r'\(([0-9]+) n\.t\.\)') # length in AA == length in NT / 3 - 1
def _parseSsdbOrthologView(htmlString) -> Tuple[int, List[SSDB.PreMatch]]:
    """
    Parse the HTML of an SSDB ortholog view page.

    Parameters
    ----------
    htmlString : str
        HTML of the page.

    Returns
    -------
    Tuple[int, List[SSDB.PreMatch]]
        Length of the searched sequence, and one Pre-Match per table row after the head row.
        *None*, if anything goes wrong while parsing. Only regular errors are turned into *None*; KeyboardInterrupt and SystemExit propagate to the caller.
    """
    try:
        html = BeautifulSoup(htmlString, 'html.parser')

        # the sequence length is searched in the node following the first link of the body
        lengthText = html.body.a.next_sibling
        searchedSequenceLengthMatch = AA_SEQ_LENGTH_REGEX_PATTERN.search(lengthText)
        if searchedSequenceLengthMatch is None: # length in amino acids not found, maybe it is given in nucleic acids
            # NOTE(review): the nucleotide count is used as-is, not converted to amino acids -- confirm this is intended
            searchedSequenceLengthMatch = NT_SEQ_LENGTH_REGEX_PATTERN.search(lengthText)
        searchedSequenceLength = int(searchedSequenceLengthMatch.group(1))

        matches = []
        for rowIndex, tr in enumerate(html.table.children):
            # ignore head of table
            if rowIndex == 0:
                continue

            for columnIndex, td in enumerate(tr.children):
                if columnIndex == 0: # read gene ID
                    foundGeneIdString = td.text
                elif columnIndex == 1: # read Smith-Waterman score
                    swScore = int(td.text)
                elif columnIndex == 2: # read bit score
                    bitScore = float(td.text)
                elif columnIndex == 3: # read identity
                    identity = float(td.text)
                elif columnIndex == 4: # read overlap
                    overlap = int(td.text)

            matches.append( SSDB.PreMatch(foundGeneIdString, swScore, bitScore, identity, overlap) )

        return (searchedSequenceLength, matches)

    except Exception:
        # best effort: any parsing problem is reported to the caller as None
        return None
def downloadOrthologOverview(geneID: 'GeneID') -> Tuple[int, List[SSDB.Match]]:
    """
    Fetch the overview of orthologs of gene `geneID` found in any organism.

    Parameters
    ----------
    geneID : GeneID
        GeneID object of the gene to be compared against, i.e. against its amino acid sequence.

    Returns
    -------
    Tuple[int, List[SSDB.Match]]
        Length of the searched sequence, and the list of Matches, containing gene IDs of orthologs and other data necessary for sequence matching.
        *None*, if parsing the page raised IndexError, ValueError, or AttributeError; a message is printed in that case.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    html = _downloadOrthologOverview(geneID.geneIDString)

    try:
        return _parseSsdbBestView(html)
    except (IndexError, ValueError, AttributeError):
        print("\nError when parsing ortholog overview of gene: " + str(geneID))
        return None
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def _downloadOrthologOverview(geneIdString):
    """
    Download the SSDB best-best overview page of a gene, using the fixed threshold of 400.

    Parameters
    ----------
    geneIdString : str
        Gene ID, inserted unescaped as the `org_gene` URL parameter.

    Returns
    -------
    str
        The string representation of the downloaded bytes, with every literal backslash-n sequence removed.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    url = 'https://www.kegg.jp/ssdb-bin/ssdb_best_best?threshold=400&org_gene=' + geneIdString
    # the context manager closes the HTTP response even if read() fails
    with urllib.request.urlopen(url, timeout=settings.downloadTimeoutSocket) as response:
        rawContent = response.read()
    # str() of the raw content is kept on purpose: line breaks then show up as the
    # two characters backslash + 'n', which is exactly what gets stripped here
    return str(rawContent).replace('\\n', '')
# Field separator for one text row of the SSDB overview: a closing parenthesis with
# trailing whitespace, an opening parenthesis with surrounding whitespace, or plain whitespace.
# Every alternative consumes at least one character. A pattern that can also match the
# empty string makes re.split() cut between every single character on Python >= 3.7.
SSDB_OVERVIEW_REGEX = re.compile(r"\)\s*|\s*\(\s*|\s+")
def _parseSsdbBestView(htmlString) -> Tuple[int, List[SSDB.Match]]:
    """
    Parse the HTML of an SSDB best-best overview page.

    Parameters
    ----------
    htmlString : str
        HTML of the page. Every occurrence of '&#' is removed before parsing.

    Returns
    -------
    Tuple[int, List[SSDB.Match]]
        Length of the searched sequence, and one Match per result row. Rows for which :class:`SSDB.Match` raises ValueError are left out.

    Raises
    ------
    IndexError
        If a row has too few fields. The offending row is printed first.
    ValueError
        If a field of a row can not be converted to a number. The offending row is printed first.
    """
    html = BeautifulSoup(htmlString.replace('&#', ''), 'html.parser')

    # prefer the length in amino acids, fall back to the length in nucleic acids
    headerText = html.table.tr.text
    lengthMatch = AA_SEQ_LENGTH_REGEX_PATTERN.search(headerText) or NT_SEQ_LENGTH_REGEX_PATTERN.search(headerText)
    searchedSequenceLength = int(lengthMatch.group(1))

    matches = []

    # every result row starts with an <input> tag; the part before the first one is the table heading
    rows = str(html.pre).split('<input')[1:]
    for row in rows:
        line = '<input' + row
        lineHtml = BeautifulSoup(line, 'html.parser')

        foundGeneIdString = lineHtml.input['value']
        textFields = SSDB_OVERVIEW_REGEX.split(lineHtml.text)

        # the numeric columns are addressed from the end of the row
        try:
            length = int(textFields[-8])
            swScore = int(textFields[-7])
            bitScore = float(textFields[-5])
            identity = float(textFields[-4])
            overlap = int(textFields[-3])
        except (IndexError, ValueError):
            print(line)
            raise

        try:
            matches.append( SSDB.Match(foundGeneIdString, swScore, bitScore, identity, overlap, length) )
        except ValueError: # ignore GeneID which is not correctly formatted. Can happen when it is an incomplete "Addendum" organism, or a virus, etc.
            continue

    return (searchedSequenceLength, matches)
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax)
def downloadTaxonomyNCBI() -> str:
    """
    Fetch the NCBI taxonomy from KEGG BRITE (entry 'br:br08610').

    The download is retried with exponential backoff.

    Returns
    -------
    str
        NCBI taxonomy in special text format.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    response = REST.kegg_get('br:br08610', timeout=settings.downloadTimeoutSocket)
    return response.read()
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax)
def downloadTaxonomyKEGG():
    """
    Fetch the KEGG taxonomy from KEGG BRITE (entry 'br:br08601').

    The download is retried with exponential backoff.

    Returns
    -------
    str
        KEGG taxonomy in special text format.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    response = REST.kegg_get('br:br08601', timeout=settings.downloadTimeoutSocket)
    return response.read()
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadSubstance(substanceID):
    """
    Fetch the description file of a substance, compound or glycan, from KEGG.

    The download is retried with exponential backoff; HTTP 404 errors are not retried.

    Parameters
    ----------
    substanceID : str
        The ID string of the substance to download, e.g. 'C00084'.

    Returns
    -------
    str
        Content of the substance's description.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    response = REST.kegg_get(substanceID, timeout=settings.downloadTimeoutSocket)
    return response.read()
@retry(wait_exponential_multiplier=settings.retryDownloadBackoffFactor, wait_exponential_max=settings.retryDownloadBackoffMax, stop_max_delay=settings.retryDownloadMax, retry_on_exception=is_not_404) # do not retry on HTTP error 404, raise immediately instead
def downloadEcEnzyme(ecNumberID):
    """
    Fetch the description file of an enzyme from KEGG, identified by its EC number.

    The download is retried with exponential backoff; HTTP 404 errors are not retried.

    Parameters
    ----------
    ecNumberID : str
        The EC number string of the enzyme to download, e.g. '4.1.2.48'. It is prefixed with 'ec:' to form the KEGG entry ID.

    Returns
    -------
    str
        Content of the enzyme's description.

    Raises
    ------
    URLError
        If connection to KEGG fails.
    """
    entryID = 'ec:' + ecNumberID
    response = REST.kegg_get(entryID, timeout=settings.downloadTimeoutSocket)
    return response.read()