Source code for FEV_KEGG.Statistics.SequenceComparison

from builtins import int
import math
from FEV_KEGG import settings


[docs]def getExpectationValue(bitScore: float, searchSequenceLength: int, foundSequenceLength: int, numberOfSequencesInDatabase: int) -> float: """ Returns the E-value of a single query in a sequence database. Parameters ---------- bitScore : float Comparison score, normalised to base 2 (bits). searchSequenceLength : int Length of the sequence that was the input of the query. foundSequenceLength : int Length of the sequence that was the result of the query. If you have several results, run this function for each of them individually. numberOfSequencesInDatabase : int Count of all sequences that could have potentially be found. For example, a search in all genes of eco would mean the count of all sequenced genes for eco -> numberOfSequencesInDatabase = 4,498. A search in all organisms, however, would mean the count of all sequenced genes in KEGG -> numberOfSequencesInDatabase = 25,632,969. Returns ------- float Statistical E-value (expectation value) for the occurence of a match of the same confidence with a totally unrelated, e.g. random, sequence. """ return numberOfSequencesInDatabase * searchSequenceLength * foundSequenceLength * math.pow(2, -bitScore)
[docs]def isMatchSignificant(bitScore: float, searchSequenceLength: int, foundSequenceLength: int, numberOfSequencesInDatabase: int, significanceThreshold: float = settings.defaultEvalue) -> bool: """ Check if a sequence match is significant. Calculates E-value, using :func:`getExpectationValue`. If E-value is smaller than `significanceThreshold`, match is significant, return True. Parameters ---------- bitScore : float Comparison score, normalised to base 2 (bits). searchSequenceLength : int Length of the sequence that was the input of the query. foundSequenceLength : int Length of the sequence that was the result of the query. If you have several results, run this function for each of them individually. numberOfSequencesInDatabase : int Count of all sequences that could have potentially be found. For example, a search in all genes of eco would mean the count of all sequenced genes for eco -> numberOfSequencesInDatabase = 4,498. A search in all organisms, however, would mean the count of all sequenced genes in KEGG -> numberOfSequencesInDatabase = 25,632,969. significanceThreshold : float, optional Threshold of E-value below which to consider a match significant. Returns ------- bool Whether a sequence match is significant. """ if getExpectationValue(bitScore, searchSequenceLength, foundSequenceLength, numberOfSequencesInDatabase) <= significanceThreshold: return True else: return False