Communicated genes overrepresentation analysis

Performs the over-representation analysis of the direct and communicated gene sets using the Gene Ontology Biological Processes from the c5 collection in the Molecular Signatures Database.

import os
from json import dumps
import logging
import pandas as pd
import numpy as np
import math
import json
import copy

import matplotlib.pyplot as plt
from matplotlib import cm

import statsmodels.stats as sts

from CoRe import reader, fnGO

Read the gene sets and determine the list of unique genes.

total_genes = 17949
gene_set_cutoff = 373

Specify positive false discovery rate

fdr = 0.01

The GOBP gene sets are stored as a dict in json format.

current_directory = os.getcwd()

go_directory = current_directory + "/Examples/GO_sets"
os.chdir(go_directory)

f = open(('GO_BPs.json'))
GO_BPs = json.load(f)
f.close()

bp_names = list(GO_BPs.keys())

for bp in GO_BPs.keys():
    GO_BPs[bp] = pd.DataFrame(GO_BPs[bp])

print('Total gene sets: ',len(list(GO_BPs.keys())))

wf = open('BP_embedding.csv')
all_lines = wf.readlines()
wf.close()

embed_idx = {}

for l in all_lines:
    all_values = l.rstrip('\r\n').split(',')

    embed_idx[int(all_values[0])] = []

    for k in all_values[1:]:
        embed_idx[int(all_values[0])].append(int(k))


bp_names = pd.read_csv('GOBP_list.csv',header=None)[0].to_list()
Total gene sets:  7481

Read the two sets of genes.

PPI set: Genes that have direct protein-protein interactions with SARS-CoV-2 proteins. Total interaction set: Genes that have either direct interaction or receive communication from SARS-CoV-2 proteins.

selected_pathway = 'Immune System'
pathway_nametag = selected_pathway.replace(' ','_')

network_type = 'medium-PPI'
state_type = 'maxEnt'

data_directory = current_directory + "/Examples/"+pathway_nametag
os.chdir(data_directory)

direct_interaction_set = reader.read_interactions_for_GO('SARS_CoV2-'+pathway_nametag+'_interactions.json')
total_interaction_set = reader.read_interactions_for_GO(state_type+'-SARS_CoV2_total_'+pathway_nametag+'_'+network_type+'_interactions.json')

Identify the sources, or SARS-CoV-2 proteins, that are causing communication. We determine this if for a SARS-CoV-2 protein the number of affected genes is greater than the number of genes that have PPI.

sources = []

print(f"{'SARS-CoV-2 protein'+'':<20} {'Directly interacting proteins'+'':<30} {'Direct and communicated proteins'+'':<25}")

for k in total_interaction_set.keys():
    if len(total_interaction_set[k])>len(direct_interaction_set[k]):
        sources.append(k)
        print(f"{k:<20} {len(direct_interaction_set[k]):<30} {len(total_interaction_set[k]):<25}")
SARS-CoV-2 protein   Directly interacting proteins  Direct and communicated proteins
SARS-CoV2 Nsp2       2                              3
SARS-CoV2 Nsp7       9                              28
SARS-CoV2 Nsp12      1                              7
SARS-CoV2 Nsp13      1                              3
SARS-CoV2 Nsp14      2                              7
SARS-CoV2 ORF3a      1                              2
SARS-CoV2 M          2                              33
SARS-CoV2 ORF8       7                              46
SARS-CoV2 ORF9b      1                              3
SARS-CoV2 ORF9c      2                              5
SARS-CoV2 ORF10      2                              4
go_tags, go_names, p_values = fnGO.compute_p_values(sources,GO_BPs,total_interaction_set,total_genes)

Save the GOBP gene sets that have a nontrivial raw \(p\)-value.

json_obj = json.dumps(go_tags)

f = open(state_type+'-all_nontrivial_GOs.json','w')
f.write(json_obj)
f.close()

Multiple hypothesis testing using Benjamini-Hochberg (BH) correction

q_values = {}

for s_g in sources:
    go_tags[s_g], go_names[s_g], q_values[s_g] = fnGO.compute_q_values(p_values[s_g],go_names[s_g],go_tags[s_g],fdr)
go_idx = {}

for s in sources:
    go_idx[s] = []
    for go in go_tags[s]:
        go_idx[s].append(bp_names.index(go))
idx_to_remove = {}

for s in sources:
    idx_to_remove[s] = []

    k = 0

    for ii in go_idx[s]:
        idx_copy = copy.deepcopy(go_idx[s])
        idx_copy.remove(ii)

        d1 = pd.DataFrame(embed_idx[ii])
        d2 = pd.DataFrame(idx_copy)

        if len(d1)>0 and len(d2)>0:
            intersection = pd.merge(d1, d2, how='inner').drop_duplicates([0])
            len_intersection = int(intersection[0].count())

            if len_intersection>0:
                idx_to_remove[s].append(k)

        k += 1
for s_g in sources:
    if len(go_names[s_g])>0:
        idx_to_remove[s_g].reverse()

        q_list = q_values[s_g].tolist()

        for k in idx_to_remove[s_g]:
            go_names[s_g].pop(k)
            go_tags[s_g].pop(k)
            q_list.pop(k)

        q_values[s_g] = np.array(q_list)

Save all GOBP gene sets that are significant after the BH correction.

json_obj = json.dumps(go_names)

f = open(state_type+'-all_gos-afterBH.json','w')
f.write(json_obj)
f.close()
all_unique_go_names = []

for s in go_names.keys():
    all_unique_go_names += go_names[s]

all_unique_go_names = list(set(all_unique_go_names))

wf = open(state_type+'-all_unique_go_names.csv','w')

for g_n in all_unique_go_names:
    print(g_n,file=wf)

wf.close()
all_unique_go_tags = []

for s in go_tags.keys():
    all_unique_go_tags += go_tags[s]

all_unique_go_tags = list(set(all_unique_go_tags))

Options for selecting the number of GOBP gene sets for plotting.

Option 1: Plot all significant BPs. Option 2: Specify the top \(n\) significant BPs to select for each SARS-CoV-2 protein. Option 3: Specify a file containing a list of Gene Ontology Biological Processes to select.

plot_option = 'high-selected_gos-top3.csv'

if plot_option==np.inf:
    selected_gos = []

    for s in sources:
        for go,q in zip(go_names[s],q_values[s]):
            if go not in selected_gos:
                selected_gos.append(go)
elif type(plot_option)==int:
    selected_gos = []
    for s in sources:
        n_c = min(plot_option,len(go_names[s]))

        for go,q in zip(go_names[s][:n_c],q_values[s][:n_c]):
            if go not in selected_gos and q<=fdr:
                selected_gos.append(go)

    wf = open(state_type+'-selected_gos-top'+str(plot_option)+'.csv','w')

    for s in selected_gos:
        print(s,file=wf)

    wf.close()
else:
    selected_gos = pd.read_csv(plot_option,header=None)[0].to_list()

print(len(selected_gos))
22
total_q = (math.log10(1))*np.ones(shape=(len(selected_gos),len(sources)))
q_matrix = np.zeros(shape=(len(selected_gos),len(sources)))

for i in range(0,len(selected_gos)):
    for j in range(0,len(sources)):
        s_g = sources[j]

        try:
            k = go_names[s_g].index(selected_gos[i])

            if q_values[s_g][k]<fdr:
                total_q[i,j] = -math.log10(q_values[s_g][k])
            #else:
            #    total_q[i,j] = 1.0

            q_matrix[i,j] = q_values[s_g][k]

        except ValueError:
            pass

short_sars = []

for s in sources:
    short_sars.append(s.split(' ')[1])

np.savetxt(state_type+'-q_mat-H.csv',q_matrix,delimiter=',')
c_max = 10#min(np.max(total_q),10.0)
r = c_max + math.log10(fdr)
c_min = -math.log10(fdr) - r
pixel_width = 0.5

l = pixel_width*len(selected_gos)
#w = 0.55*pixel_width*len(list(total_interaction_set))
#print(l,w)
#print(np.min(total_q))

non_zero_short_sars = []

non_zero_q = np.zeros(shape=total_q.shape)

for i in range(total_q.shape[1]):
    #print(np.sum(total_q[:,i]))

    if np.sum(total_q[:,i])>0.0:
        non_zero_short_sars.append(short_sars[i])

        if np.sum(non_zero_q)==0.0:
            non_zero_q = total_q[:,i]
        else:
            non_zero_q = np.vstack((non_zero_q,total_q[:,i]))

print(non_zero_short_sars)

non_zero_q = np.transpose(non_zero_q)

print(non_zero_q.shape,total_q.shape)

w = 0.9*pixel_width*len(non_zero_short_sars)

print(w,l)
['Nsp7', 'Nsp12', 'Nsp13', 'Nsp14', 'M', 'ORF8', 'ORF9b', 'ORF9c', 'ORF10']
(22, 9) (22, 11)
4.05 11.0
fig, ax = plt.subplots(figsize=(w,l))

plt.imshow(non_zero_q,cmap=cm.Reds,aspect='auto',vmin=0,vmax=c_max)

cbar = plt.colorbar(fraction=0.05,pad=0.05)

ax.set_xticks(np.arange(0,len(non_zero_short_sars)))
ax.set_xticklabels(non_zero_short_sars,fontsize=20,rotation=90)

ax.set_yticks(np.arange(0,len(selected_gos)))
ax.set_yticklabels(selected_gos,fontsize=18)

cbar.set_label(r'$-\log_{10}$($Q$)',fontsize=18,rotation=-90,labelpad=20)
cbar.ax.tick_params(labelsize=18)

ax.set_xticks(np.arange(-.5, len(non_zero_short_sars), 1), minor=True)
ax.set_yticks(np.arange(-.5, len(selected_gos), 1), minor=True)

ax.grid(which='minor', color='black', linestyle='-', linewidth=1.5, alpha=0.5)
_images/output_30_1.png
go_tags, go_names, p_values = fnGO.compute_p_values(sources,GO_BPs,direct_interaction_set,total_genes)
q_values = {}

for s_g in sources:
    go_tags[s_g], go_names[s_g], q_values[s_g] = fnGO.compute_q_values(p_values[s_g],go_names[s_g],go_tags[s_g],fdr)
total_q = (-math.log10(1))*np.ones(shape=(len(selected_gos),len(sources)))

for i in range(0,len(selected_gos)):
    for j in range(0,len(sources)):
        s_g = sources[j]

        try:
            k = go_names[s_g].index(selected_gos[i])

            if q_values[s_g][k]<=fdr:
                total_q[i,j] = -math.log10(q_values[s_g][k])
            else:
                total_q[i,j] = 0.0

        except ValueError:
            pass

short_sars = []

for s in sources:
    short_sars.append(s.split(' ')[1])
c_max = 10
r = c_max + math.log10(fdr)
c_min = -math.log10(fdr) - r

print(np.min(total_q))

print(c_max,c_min,r)
-0.0
10 -6.0 8.0
pixel_width = 0.5

l = pixel_width*len(selected_gos)

non_zero_short_sars = []

non_zero_q = np.zeros(shape=total_q.shape)

for i in range(total_q.shape[1]):

    if np.sum(total_q[:,i])>0.0:
        non_zero_short_sars.append(short_sars[i])

        if np.sum(non_zero_q)==0.0:
            non_zero_q = total_q[:,i]
        else:
            non_zero_q = np.vstack((non_zero_q,total_q[:,i]))

print(non_zero_short_sars)

non_zero_q = np.transpose(non_zero_q)

print(non_zero_q.shape,total_q.shape)
w = 0.9*pixel_width*len(non_zero_short_sars)
['Nsp7', 'Nsp14', 'ORF8', 'ORF10']
(22, 4) (22, 11)
fig, ax = plt.subplots(figsize=(w,l))

plt.imshow(non_zero_q,cmap=cm.binary,aspect='auto',vmin=0,vmax=c_max)

cbar = plt.colorbar(fraction=0.05,pad=0.05)

ax.set_xticks(np.arange(0,len(non_zero_short_sars)))
ax.set_xticklabels(non_zero_short_sars,fontsize=20,rotation=90)

ax.set_yticks(np.arange(0,len(selected_gos)))
ax.set_yticklabels(selected_gos,fontsize=18)

cbar.set_label(r'$-\log_{10}$($Q$)',fontsize=18,rotation=-90,labelpad=40)
cbar.ax.tick_params(labelsize=18)

ax.set_xticks(np.arange(-.5, len(non_zero_short_sars), 1), minor=True)
ax.set_yticks(np.arange(-.5, len(selected_gos), 1), minor=True)

ax.grid(which='minor', color='black', linestyle='-', linewidth=1.5, alpha=0.5)
_images/output_36_0.png