Source code for FEV_KEGG.Experiments.22

"""
Context
-------
:mod:`19` found many EC numbers new to an example group of Enterobacteriales vs. the super group of Gammaproteobacteria.
"108/190 -> 56.8% of EC numbers in Enterobacteriales are new, compared to Gammaproteobacteria consensus"

Question
--------
Could this be due to incomplete data in KEGG? Because substance-ec graphs are intersected to form the consensus.
Does the same result occur when intersecting the set of EC numbers itself?

Method
------
Similar to :mod:`19`, only intersect sets instead of networks.
- Create a group of example organisms of Order Enterobacteriales.
- Create a group of example organisms of Class Gammaproteobacteria, including the same organsims as the group of Enterobacteriales.
- Get sets of EC numbers from each graph.
- Calculate consensus set for both groups (Order and Class). Leaving only EC numbers which occur in all organisms of the group.
- Calculate the difference of the two sets of consensus EC numbers, leaving only the EC numbers which occur in Enterobacteriales consensus, but not in Gammaproteobacteria consensus.
- Print these EC numbers and their percentage of all EC numbers in Enterobacteriales, ie. how many of the EC numbers in Enterobacteriales do not exist in Gammaproteobacteria consensus.

Result
------

::

    107/190 -> 56.3% of EC numbers in Enterobacteriales are new, compared to Gammaproteobacteria consensus

Conclusion
----------
Only one EC number (0.5%) was lost due to the way the consensus is calculated. This kind of error in KEGG data should not be able to cause much difference in results.
"""
import FEV_KEGG.KEGG.Organism
from FEV_KEGG.Statistics.Percent import getPercentSentence


[docs]def enterobacterialesEcSets():
    #- Create a group of example organisms of Order Enterobacteriales.
    enterobacteriales_organisms_abbreviations = ['eco', 'ses', 'sfl', 'ent', 'esa', 'kpn', 'cko', 'ype', 'spe', 'buc']
    
    enterobacteriales_organisms = FEV_KEGG.KEGG.Organism.Group(organismAbbreviations = enterobacteriales_organisms_abbreviations)
    
    ecNumberSets = []
    
    for graph in enterobacteriales_organisms.ecGraphs().values():
        ecNumberSets.append(graph.getECs())
    
    return ecNumberSets


[docs]def gammaproteobacteriaEcSets():
    #- Create a group of example organisms of Class Gammaproteobacteria, including the same organsims as the group of Enterobacteriales.
    enterobacteriales_organisms_abbreviations = ['eco', 'ses', 'sfl', 'ent', 'esa', 'kpn', 'cko', 'ype', 'spe', 'buc']
    gammaproteobacteria_organisms_abbreviations = ['hin', 'mht', 'xcc', 'vch', 'pae', 'acb', 'son', 'pha', 'amc', 'lpn', 'ftu', 'aha']
    gammaproteobacteria_organisms_abbreviations.extend(enterobacteriales_organisms_abbreviations) # extend with the sub-set, because they are also part of the set
    
    gammaproteobacteria_organisms = FEV_KEGG.KEGG.Organism.Group(organismAbbreviations = gammaproteobacteria_organisms_abbreviations)
    
    ecNumberSets = []
    
    for graph in gammaproteobacteria_organisms.ecGraphs().values():
        ecNumberSets.append(graph.getECs())
    
    return ecNumberSets
    

if __name__ == '__main__':
    
    #- Get sets of EC numbers from each graph.
    enterobacteriales_EC_setList = enterobacterialesEcSets()
    gammaproteobacteria_EC_setList = gammaproteobacteriaEcSets()
    
    #- Calculate consensus set for both groups (Order and Class). Leaving only EC numbers which occur in all organisms of the group.
    enterobacteriales_EC_set = enterobacteriales_EC_setList.pop()
    for ecNumberSet in enterobacteriales_EC_setList:
        enterobacteriales_EC_set.intersection_update(ecNumberSet)
        
    gammaproteobacteria_EC_set = gammaproteobacteria_EC_setList.pop()
    for ecNumberSet in gammaproteobacteria_EC_setList:
        gammaproteobacteria_EC_set.intersection_update(ecNumberSet)
    
    #- Calculate the difference of the two sets of consensus EC numbers, leaving only the EC numbers which occur in Enterobacteriales consensus, but not in Gammaproteobacteria consensus.
    only_enterobacteriales_EC_set = enterobacteriales_EC_set.difference(gammaproteobacteria_EC_set)
    
    #- Print these EC numbers and their percentage of all EC numbers in Enterobacteriales, ie. how many of the EC numbers in Enterobacteriales do not exist in Gammaproteobacteria consensus.
    output = []
    
    for ec in only_enterobacteriales_EC_set:
        output.append(ec.__str__())
    
    output.sort()
    print(str(len(output)) + ' results')
    for line in output:
        print(line)
        
    print( getPercentSentence(len(only_enterobacteriales_EC_set), len(enterobacteriales_EC_set)) + ' of EC numbers in Enterobacteriales are new, compared to Gammaproteobacteria consensus' )