Source code for ckg.analytics_core.analytics.wgcnaAnalysis

import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from collections import defaultdict
from natsort import natsorted

try:
    from rpy2 import robjects as ro
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    from rpy2.rinterface_lib import embedded
    pandas2ri.activate()
    # sys.setrecursionlimit(10000)

    #Call R
    R = ro.r
    R('options(stringsAsFactors = FALSE)')

    #Call R packages
    base = importr('base')
    stats = importr('stats')
    WGCNA = importr('WGCNA')
    flashClust = importr('flashClust')
except ImportError:
    print("WGCNA functions will not work. Module Rpy2 not installed.")
except Exception as err:
    print("WGCNA functions will not work. Missing installation. Error: {}".format(err))


def get_data(data, drop_cols_exp=['subject', 'group', 'sample', 'index'], drop_cols_cli=['subject', 'group', 'biological_sample', 'index'], sd_cutoff=0):
    """
    This function cleans up and formats experimental and clinical data into similarly shaped dataframes.

    :param dict data: dictionary with processed clinical and proteomics datasets.
    :param list drop_cols_exp: list of columns to drop from the processed experimental (proteomics/rna-seq/dna-seq) dataframe.
    :param list drop_cols_cli: list of columns to drop from the processed clinical dataframe.
    :param float sd_cutoff: if greater than 0, experimental features whose standard deviation is below this cutoff are dropped.
    :return: Dictionary with experimental and clinical dataframes (keys are the same as in the input dictionary).
    """
    wgcna_data = {}
    for i in data:
        if data[i] is not None:
            df = data[i]
            if i == 'clinical':
                df.drop_duplicates(keep='first', inplace=True)
                df = df.reset_index()
                df['rows'] = df[['subject', 'group']].apply(lambda x: '_'.join(x), axis=1)
                df.set_index(['rows'], inplace=True)
                df = df.reindex(index=natsorted(df.index))
                df = df.drop(drop_cols_cli, axis=1)
            else:
                df = df.reset_index()
                df['rows'] = df[['subject', 'group']].apply(lambda x: '_'.join(x), axis=1)
                df.set_index(['rows'], inplace=True)
                df = df.reindex(index=natsorted(df.index))
                df = df.drop(drop_cols_exp, axis=1)
                if sd_cutoff > 0:
                    df = df.loc[:, df.std() > sd_cutoff]
            wgcna_data[i] = df

    return wgcna_data

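# Illustrative usage sketch, not part of the original module: get_data expects a dictionary of
# already-processed dataframes keyed by dataset name. The 'clinical' key is treated specially;
# every other key is handled as experimental data. The 'proteomics' key and the input dataframes
# below are assumptions made for the example.
def _example_get_data(clinical_df, proteomics_df):
    wgcna_data = get_data({'clinical': clinical_df, 'proteomics': proteomics_df}, sd_cutoff=0.5)
    return wgcna_data['proteomics'], wgcna_data['clinical']
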
def get_dendrogram(df, labels, distfun='euclidean', linkagefun='ward', div_clusters=False, fcluster_method='distance', fcluster_cutoff=15):
    """
    This function calculates the distance matrix and performs hierarchical cluster analysis on the resulting set of dissimilarities.

    :param df: pandas dataframe with samples/subjects as index and features as columns.
    :param list labels: labels for the leaves of the tree.
    :param str distfun: distance measure to be used ('euclidean', 'maximum', 'manhattan', 'canberra', 'binary', 'minkowski' or 'jaccard').
    :param str linkagefun: hierarchical/agglomeration method to be used ('single', 'complete', 'average', 'weighted', 'centroid', 'median' or 'ward').
    :param bool div_clusters: whether to divide the dendrogram leaves into flat clusters (True or False).
    :param str fcluster_method: criterion to use in forming flat clusters.
    :param int fcluster_cutoff: maximum cophenetic distance between observations in each cluster.
    :return: Dictionary of data structures computed to render the dendrogram. Keys: 'icoords', 'dcoords', 'ivl' and 'leaves'. If div_clusters is True, a dictionary mapping each cluster to its leaves is returned as well.
    """
    # np.random.seed(112736)
    if df is not None and len(labels) > 0:
        if distfun is None:
            dist = np.asarray(stats.as_dist(df))
        else:
            dist = np.asarray(stats.dist(df, method=distfun))

        Z = linkage(dist, method=linkagefun)
        Z_dendrogram = dendrogram(Z, no_plot=True, labels=labels)

        if div_clusters:
            clusters = get_clusters_elements(Z, fcluster_method, fcluster_cutoff, labels)
            return Z_dendrogram, clusters
        else:
            return Z_dendrogram

    return None

def get_clusters_elements(linkage_matrix, fcluster_method, fcluster_cutoff, labels):
    """
    This function forms flat clusters from a hierarchical clustering, using the same interface as scipy.cluster.hierarchy.fcluster.

    :param ndarray linkage_matrix: hierarchical clustering encoded with a linkage matrix.
    :param str fcluster_method: criterion to use in forming flat clusters ('inconsistent', 'distance', 'maxclust', 'monocrit', 'maxclust_monocrit').
    :param float fcluster_cutoff: maximum cophenetic distance between observations in each cluster.
    :param list labels: labels for the leaves of the dendrogram.
    :return: A dictionary where keys are the cluster numbers and values are the dendrogram leaves.
    """
    clust = fcluster(linkage_matrix, fcluster_cutoff, fcluster_method)
    clusters = defaultdict(list)
    for i, j in zip(clust, labels):
        clusters[i].append(j)

    return clusters

def filter_df_by_cluster(df, clusters, number):
    """
    Select only the members of a defined cluster.

    :param df: pandas dataframe with samples/subjects as index and features as columns.
    :param dict clusters: clusters dictionary from the get_dendrogram function when the div_clusters option was True.
    :param int number: cluster number (key).
    :return: Pandas dataframe with all the features (columns) and the samples/subjects belonging to the defined cluster (index).
    """
    return df[df.index.isin(clusters[number])]

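# Illustrative usage sketch, not part of the original module: cluster the samples of an
# experimental dataframe, split the dendrogram leaves into flat clusters, and keep only the
# samples of one cluster. `df_exp` is assumed to be a dataframe as returned by get_data
# (samples as index, features as columns); cluster number 1 is an arbitrary choice.
def _example_sample_clusters(df_exp):
    Z_dendrogram, clusters = get_dendrogram(df_exp, labels=list(df_exp.index),
                                            distfun='euclidean', linkagefun='ward',
                                            div_clusters=True, fcluster_method='distance',
                                            fcluster_cutoff=15)
    return filter_df_by_cluster(df_exp, clusters, number=1)
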
def df_sort_by_dendrogram(df, Z_dendrogram):
    """
    Reorders a pandas dataframe by index according to the dendrogram list of leaf node labels.

    :param df: pandas dataframe with the labels to be reordered as index.
    :param dict Z_dendrogram: dictionary of data structures computed to render the dendrogram. Keys: 'icoords', 'dcoords', 'ivl' and 'leaves'.
    :return: Reordered pandas dataframe.
    """
    data = df.copy()
    data.index = pd.CategoricalIndex(data.index, categories=Z_dendrogram['ivl'])
    data.sort_index(level=0, inplace=True)

    return data

def get_percentiles_heatmap(df, Z_dendrogram, bydendro=True, bycols=False):
    """
    This function transforms the absolute values in each row or column (option 'bycols') into relative values.

    :param df: pandas dataframe with samples/subjects as index and features as columns.
    :param dict Z_dendrogram: dictionary of data structures computed to render the dendrogram. Keys: 'icoords', 'dcoords', 'ivl' and 'leaves'.
    :param bool bydendro: set to True to order the labels according to the dendrogram list of leaf node labels, otherwise set to False.
    :param bool bycols: if False, relative values are calculated across rows (samples), i.e. within each feature column; if True, across columns (features), i.e. within each sample row.
    :return: Pandas dataframe.
    """
    if bydendro:
        df2 = df_sort_by_dendrogram(df, Z_dendrogram)
    else:
        df2 = df

    p = pd.DataFrame(index=df2.index, columns=df2.columns)
    if bycols:
        for j in df2.index:
            for i in df2.columns:
                pct = (df2.loc[j, i] - np.nanmin(df2.loc[j, :])) / ((np.nanmax(df2.loc[j, :]) - np.nanmin(df2.loc[j, :])) * 1.)
                pct = pct - (pct - 0.5) * 1. / 40  # have to rescale it to account for endpoints of cmaps
                p.loc[j, i] = pct
    else:
        for i in df2.index:
            for j in df2.columns:
                pct = (df2.loc[i, j] - np.nanmin(df2.loc[:, j])) / ((np.nanmax(df2.loc[:, j]) - np.nanmin(df2.loc[:, j])) * 1.)
                pct = pct - (pct - 0.5) * 1. / 40  # have to rescale it to account for endpoints of cmaps
                p.loc[i, j] = pct

    return p

def get_miss_values_df(data):
    """
    Processes a pandas dataframe so missing values can be plotted in a heatmap with a specific color.

    :param data: pandas dataframe.
    :return: Pandas dataframe with missing values as integer 1, and originally valid values as NaN.
    """
    df = data.copy()
    df = df.isnull().astype('int')
    df = df.replace(0, np.nan)

    return df

def paste_matrices(matrix1, matrix2, rows, cols):
    """
    Takes two matrices with analogous shapes and concatenates each value in matrix 1 with the corresponding one in matrix 2, returning a single pandas dataframe.

    :param ndarray matrix1: input 1
    :param ndarray matrix2: input 2
    :param rows: index labels of the output dataframe.
    :param cols: column labels of the output dataframe.
    :return: Pandas dataframe.
    """
    # a = pandas2ri.ri2py(matrix1)
    # b = pandas2ri.ri2py(matrix2)
    text = []
    for i, j in zip(matrix1, matrix2):
        for x, y in zip(i, j):
            text.append(('{:0.2}<br>{:.0e}'.format(x, y)))
    text = np.array(text)
    text.shape = (matrix1.shape[0], matrix1.shape[1])
    textMatrix = pd.DataFrame(text, index=rows, columns=cols)

    return textMatrix

def cutreeDynamic(distmatrix, linkagefun='average', minModuleSize=50, method='hybrid', deepSplit=2, pamRespectsDendro=False, distfun=None):
    """
    This function wraps the R cutreeDynamic function in Python, providing an access point for methods of adaptive branch pruning of hierarchical clustering dendrograms.

    :param distmatrix: pandas dataframe with the dissimilarity matrix (e.g. TOM dissimilarity).
    :param str distfun: distance measure to be used ('euclidean', 'maximum', 'manhattan', 'canberra', 'binary', 'minkowski' or 'jaccard'); not used, the dissimilarity matrix is passed directly.
    :param str linkagefun: hierarchical/agglomeration method to be used ('single', 'complete', 'average', 'weighted', 'centroid', 'median' or 'ward').
    :param int minModuleSize: minimum module size.
    :param str method: method to use ('hybrid' or 'tree').
    :param int deepSplit: provides rough control over sensitivity to cluster splitting; the higher the value (with the 'hybrid' method) or if True (with the 'tree' method), the more and smaller the modules.
    :param bool pamRespectsDendro: only used for method 'hybrid'. Objects and small modules will only be assigned to modules that belong to the same branch in the dendrogram structure.
    :return: Numpy array of numerical labels giving the assignment of objects to modules. Unassigned objects are labeled 0, the largest module has label 1, the next largest 2, etc.
    """
    # if distfun is None:
    #     dist = stats.as_dist(distmatrix)
    # else:
    #     dist = stats.dist(distmatrix, method=distfun)
    R_function = R('''
        clusters <- function(distmatrix, linkagefun, method, minModuleSize, deepSplit, pamRespectsDendro) {
            cutreeDynamic(dendro=flashClust(as.dist(distmatrix), method=linkagefun), distM=distmatrix,
                          method=method, deepSplit=deepSplit, pamRespectsDendro=pamRespectsDendro,
                          minClusterSize=minModuleSize)}
        ''')
    cutree = R_function(distmatrix, linkagefun=linkagefun, method=method, minModuleSize=minModuleSize, deepSplit=deepSplit, pamRespectsDendro=pamRespectsDendro)

    return np.array(cutree)

def build_network(data, softPower=6, networkType='unsigned', linkagefun='average', method='hybrid', minModuleSize=50, deepSplit=2, pamRespectsDendro=False, merge_modules=True, MEDissThres=0.4, verbose=0):
    """
    Weighted gene network construction and module detection. Calculates co-expression similarity and adjacency,
    the topological overlap matrix (TOM), and clusters features into modules.

    :param data: pandas dataframe containing experimental data, with samples/subjects as rows and features as columns.
    :param int softPower: soft-thresholding power.
    :param str networkType: network type ('unsigned', 'signed', 'signed hybrid', 'distance').
    :param str linkagefun: hierarchical/agglomeration method to be used ('single', 'complete', 'average', 'weighted', 'centroid', 'median' or 'ward').
    :param str method: method to use ('hybrid' or 'tree').
    :param int minModuleSize: minimum module size.
    :param int deepSplit: provides rough control over sensitivity to cluster splitting; the higher the value (with the 'hybrid' method) or if True (with the 'tree' method), the more and smaller the modules.
    :param bool pamRespectsDendro: only used for method 'hybrid'. Objects and small modules will only be assigned to modules that belong to the same branch in the dendrogram structure.
    :param bool merge_modules: if True, very similar modules are merged.
    :param float MEDissThres: maximum dissimilarity (i.e., 1-correlation) that qualifies modules for merging.
    :param int verbose: integer level of verbosity. Zero means silent, higher values make the output progressively more verbose.
    :return: Tuple with the TOM dissimilarity pandas dataframe and a numpy array with the module color of each experimental feature.
    """
    # Calculate adjacencies
    adjacency = WGCNA.adjacency(data, power=softPower, type=networkType)

    # Transform the adjacency into a topological overlap matrix (TOM)
    TOM = WGCNA.TOMsimilarity(adjacency, verbose=verbose)

    # Calculate the corresponding dissimilarity matrix
    dissTOM = pd.DataFrame(R("1") - TOM)
    dissTOM.columns = data.columns
    dissTOM.index = data.columns

    # Identify co-expression modules
    moduleColors = identify_module_colors(dissTOM, linkagefun=linkagefun, method=method, minModuleSize=minModuleSize, deepSplit=deepSplit, pamRespectsDendro=pamRespectsDendro)

    # Merge modules whose expression profiles are very similar
    if merge_modules:
        MEs, moduleColors = merge_similar_modules(data, moduleColors, MEDissThres=MEDissThres, verbose=verbose)

    return dissTOM, moduleColors

def pick_softThreshold(data, RsquaredCut=0.8, networkType='unsigned', verbose=0):
    """
    Analysis of scale free topology for multiple soft thresholding powers. Aids the user in choosing a proper soft-thresholding power for network construction.

    :param data: pandas dataframe containing experimental data, with samples/subjects as rows and features as columns.
    :param float RsquaredCut: desired minimum scale free topology fitting index R^2.
    :param str networkType: network type ('unsigned', 'signed', 'signed hybrid', 'distance').
    :param int verbose: integer level of verbosity. Zero means silent, higher values make the output progressively more verbose.
    :return: Estimated appropriate soft-thresholding power: the lowest power for which the scale free topology fit R^2 exceeds RsquaredCut.
    :rtype: int
    """
    powers = np.arange(1, 20, 1)
    sft = WGCNA.pickSoftThreshold(data, RsquaredCut=RsquaredCut, powerVector=powers, networkType=networkType, verbose=verbose)
    softPower = sft.rx2('powerEstimate')[0]

    return softPower

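# Illustrative usage sketch, not part of the original module: estimate the soft-thresholding
# power and build the co-expression network with it. `df_exp` is assumed to be an experimental
# dataframe as returned by get_data; all other parameter values are the module defaults.
def _example_build_network(df_exp):
    softPower = pick_softThreshold(df_exp, RsquaredCut=0.8, networkType='unsigned', verbose=0)
    dissTOM, moduleColors = build_network(df_exp, softPower=softPower, networkType='unsigned',
                                          minModuleSize=50, merge_modules=True, MEDissThres=0.4)
    return dissTOM, moduleColors
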
def identify_module_colors(matrix, linkagefun='average', method='hybrid', minModuleSize=30, deepSplit=2, pamRespectsDendro=False):
    """
    Identifies co-expression modules and converts the numeric labels into colors.

    :param matrix: dissimilarity structure as produced by R stats dist.
    :param str linkagefun: hierarchical/agglomeration method to be used ('single', 'complete', 'average', 'weighted', 'centroid', 'median' or 'ward').
    :param str method: method to use ('hybrid' or 'tree').
    :param int minModuleSize: minimum module size.
    :param int deepSplit: provides rough control over sensitivity to cluster splitting; the higher the value (with the 'hybrid' method) or if True (with the 'tree' method), the more and smaller the modules.
    :param bool pamRespectsDendro: only used for method 'hybrid'. Objects and small modules will only be assigned to modules that belong to the same branch in the dendrogram structure.
    :return: Numpy array of strings with the module color of each experimental feature.
    """
    dynamicMods = cutreeDynamic(matrix, linkagefun=linkagefun, method=method, minModuleSize=minModuleSize, deepSplit=deepSplit, pamRespectsDendro=pamRespectsDendro)
    dynamicColors = np.array(WGCNA.labels2colors(dynamicMods))

    return dynamicColors

def calculate_module_eigengenes(data, modColors, softPower=6, dissimilarity=True):
    """
    Calculates module eigengenes to quantify the co-expression similarity of entire modules.

    :param data: pandas dataframe containing experimental data, with samples/subjects as rows and features as columns.
    :param ndarray modColors: array (numeric, character or a factor) attributing module colors to each feature in the experimental dataframe.
    :param int softPower: soft-thresholding power.
    :param bool dissimilarity: if True, also calculates the dissimilarity of the module eigengenes.
    :return: Pandas dataframe with the calculated module eigengenes. If dissimilarity is set to True, returns a tuple with two pandas dataframes, the first with the module eigengenes and the second with the eigengene dissimilarity.
    """
    MEs = pd.DataFrame()
    MEDiss = pd.DataFrame()
    try:
        MEList = WGCNA.moduleEigengenes(data, modColors, softPower=softPower)
        MEs0 = MEList.rx2('eigengenes')
        MEs = WGCNA.orderMEs(MEs0, verbose=0)
        if dissimilarity:
            MEcor = WGCNA.cor(MEs, verbose=0)
            # R_wrapper.R_matrix2Py_matrix converts the R correlation matrix back into a pandas
            # dataframe; the helper is not imported in this listing.
            MEcor = R_wrapper.R_matrix2Py_matrix(MEcor, MEcor.rownames, MEcor.colnames)
            MEDiss = 1 - MEcor
            return MEs, MEDiss
        else:
            return MEs, MEDiss
    except embedded.RRuntimeError as err:
        print(err)

    return MEs, MEDiss

def merge_similar_modules(data, modColors, MEDissThres=0.4, verbose=0):
    """
    Merges modules in the co-expression network that are too close, as measured by the correlation of their eigengenes.

    :param data: pandas dataframe containing experimental data, with samples/subjects as rows and features as columns.
    :param ndarray modColors: array (numeric, character or a factor) attributing module colors to each feature in the experimental dataframe.
    :param float MEDissThres: maximum dissimilarity (i.e., 1-correlation) that qualifies modules for merging.
    :param int verbose: integer level of verbosity. Zero means silent, higher values make the output progressively more verbose.
    :return: Tuple containing a pandas dataframe with the eigengenes of the new merged modules, and an array with the module color of each experimental feature.
    """
    mergedMEs = pd.DataFrame()
    mergedColors = []
    try:
        merge = WGCNA.mergeCloseModules(data, modColors, cutHeight=MEDissThres, verbose=verbose)
        mergedColors = merge.rx2('colors')
        mergedMEs = merge.rx2('newMEs')
    except embedded.RRuntimeError as err:
        print(err)

    return mergedMEs, mergedColors

def calculate_ModuleTrait_correlation(df_exp, df_traits, MEs):
    """
    Correlates eigengenes with external traits in order to identify the most significant module-trait associations.

    :param df_exp: pandas dataframe containing experimental data, with samples/subjects as rows and features as columns.
    :param df_traits: pandas dataframe containing clinical data, with samples/subjects as rows and clinical traits as columns.
    :param MEs: pandas dataframe with module eigengenes.
    :return: Tuple with two pandas dataframes, first the correlation between all module eigengenes and all clinical traits, second a dataframe with the concatenated correlation and p-value used for heatmap annotation.
    """
    nSamples = len(df_exp.index)
    moduleTraitCor = None
    textMatrix = None
    df_traits.columns = df_traits.columns.str.replace(' ', 'space')
    df_traits.columns = df_traits.columns.str.replace('(', 'parentheses1')
    df_traits.columns = df_traits.columns.str.replace(')', 'parentheses2')

    common = list(set(MEs.index).intersection(df_traits.index))
    if len(common) > 0:
        moduleTraitCor_r = WGCNA.cor(MEs.loc[common, :], df_traits.loc[common, :], use='p', verbose=0)
        moduleTraitPvalue_r = WGCNA.corPvalueStudent(moduleTraitCor_r, nSamples)
        textMatrix = paste_matrices(moduleTraitCor_r, moduleTraitPvalue_r, MEs.columns, df_traits.columns)
        moduleTraitCor = pd.DataFrame(moduleTraitCor_r, index=MEs.columns, columns=df_traits.columns)
        moduleTraitPvalue = pd.DataFrame(moduleTraitPvalue_r, index=MEs.columns, columns=df_traits.columns)
        moduleTraitCor.columns = moduleTraitCor.columns.str.replace('space', ' ')
        moduleTraitPvalue.columns = moduleTraitPvalue.columns.str.replace('space', ' ')
        textMatrix.columns = textMatrix.columns.str.replace('space', ' ')
        moduleTraitCor.columns = moduleTraitCor.columns.str.replace('parentheses1', '(')
        moduleTraitCor.columns = moduleTraitCor.columns.str.replace('parentheses2', ')')
        moduleTraitPvalue.columns = moduleTraitPvalue.columns.str.replace('parentheses1', '(')
        moduleTraitPvalue.columns = moduleTraitPvalue.columns.str.replace('parentheses2', ')')
        textMatrix.columns = textMatrix.columns.str.replace('parentheses1', '(')
        textMatrix.columns = textMatrix.columns.str.replace('parentheses2', ')')

    return moduleTraitCor, textMatrix

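# Illustrative usage sketch, not part of the original module: compute module eigengenes and
# correlate them with clinical traits. `df_exp`, `df_cli` and `moduleColors` are assumed to come
# from get_data and build_network above, and the eigengenes are assumed to come back as a pandas
# dataframe indexed by sample, as calculate_ModuleTrait_correlation expects.
def _example_module_trait_correlation(df_exp, df_cli, moduleColors):
    MEs, MEDiss = calculate_module_eigengenes(df_exp, moduleColors, dissimilarity=False)
    moduleTraitCor, textMatrix = calculate_ModuleTrait_correlation(df_exp, df_cli, MEs)
    return moduleTraitCor, textMatrix
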
def calculate_ModuleMembership(data, MEs):
    """
    For each module, calculates the correlation of the module eigengene and the feature expression profile (a quantitative measure of module membership (MM)).

    :param data: pandas dataframe containing experimental data, with samples/subjects as rows and features as columns.
    :param MEs: pandas dataframe with module eigengenes.
    :return: Tuple with two pandas dataframes, one with module membership correlations and another with p-values.
    """
    nSamples = len(data.index)
    data_r = data.copy()
    data_r.columns = data_r.columns.str.replace('~', 'dash')
    modLabels = [i[2:] for i in list(MEs.columns)]
    FeatureModuleMembership = base.as_data_frame(WGCNA.cor(data_r, MEs, use='p', verbose=0))
    MMPvalue = base.as_data_frame(WGCNA.corPvalueStudent(base.as_matrix(FeatureModuleMembership), nSamples))

    FeatureModuleMembership.columns = ['MM' + str(col) for col in modLabels]
    FeatureModuleMembership.index = data_r.columns
    MMPvalue.columns = ['p.MM' + str(col) for col in modLabels]
    MMPvalue.index = data_r.columns

    # FeatureModuleMembership = R_wrapper.R_matrix2Py_matrix(FeatureModuleMembership, FeatureModuleMembership.index, FeatureModuleMembership.columns)
    # MMPvalue = R_wrapper.R_matrix2Py_matrix(MMPvalue, MMPvalue.rownames, MMPvalue.colnames)

    FeatureModuleMembership.index = data_r.columns.str.replace('dash', '~')
    MMPvalue.index = MMPvalue.index.str.replace('dash', '~')

    return FeatureModuleMembership, MMPvalue

def calculate_FeatureTraitSignificance(df_exp, df_traits):
    """
    Quantifies associations of individual experimental features with the measured clinical traits, by defining Feature Significance (FS) as the absolute value of the correlation between the feature and the trait.

    :param df_exp: pandas dataframe containing experimental data, with samples/subjects as rows and features as columns.
    :param df_traits: pandas dataframe containing clinical data, with samples/subjects as rows and clinical traits as columns.
    :return: Tuple with two pandas dataframes, one with feature significance correlations and another with p-values.
    """
    nSamples = len(df_exp.index)
    df_exp_r = df_exp.copy()
    df_exp_r.columns = df_exp_r.columns.str.replace('~', 'dash')
    df_cli_r = df_traits.copy()
    df_cli_r.columns = df_cli_r.columns.str.replace(' ', 'space')
    df_cli_r.columns = df_cli_r.columns.str.replace('(', 'parentheses1')
    df_cli_r.columns = df_cli_r.columns.str.replace(')', 'parentheses2')

    common = list(set(df_exp_r.index).intersection(df_cli_r.index))
    FeatureTraitSignificance = base.as_data_frame(WGCNA.cor(df_exp_r.loc[common, :], df_cli_r.loc[common, :], use='p', verbose=0))
    FSPvalue = base.as_data_frame(WGCNA.corPvalueStudent(base.as_matrix(FeatureTraitSignificance), nSamples))

    FeatureTraitSignificance.columns = ['GS.' + str(col) for col in df_cli_r.columns]
    FeatureTraitSignificance.index = df_exp_r.columns
    FSPvalue.columns = ['p.GS.' + str(col) for col in df_cli_r.columns]
    FSPvalue.index = df_exp_r.columns

    # FeatureTraitSignificance = R_wrapper.R_matrix2Py_matrix(FeatureTraitSignificance, FeatureTraitSignificance.rownames, FeatureTraitSignificance.colnames)
    # FSPvalue = R_wrapper.R_matrix2Py_matrix(FSPvalue, FSPvalue.rownames, FSPvalue.colnames)

    FeatureTraitSignificance.columns = FeatureTraitSignificance.columns.str.replace('space', ' ')
    FeatureTraitSignificance.columns = FeatureTraitSignificance.columns.str.replace('parentheses1', '(')
    FeatureTraitSignificance.columns = FeatureTraitSignificance.columns.str.replace('parentheses2', ')')
    FeatureTraitSignificance.index = FeatureTraitSignificance.index.str.replace('dash', '~')
    FSPvalue.columns = df_cli_r.columns.str.replace('space', ' ')
    FSPvalue.columns = FSPvalue.columns.str.replace('parentheses1', '(')
    FSPvalue.columns = FSPvalue.columns.str.replace('parentheses2', ')')
    FSPvalue.index = FSPvalue.index.str.replace('dash', '~')

    return FeatureTraitSignificance, FSPvalue

def get_FeaturesPerModule(data, modColors, mode='dictionary'):
    """
    Groups all experimental features by the co-expression module they belong to.

    :param data: pandas dataframe containing experimental data, with samples/subjects as rows and features as columns.
    :param ndarray modColors: array (numeric, character or a factor) attributing module colors to each feature in the experimental dataframe.
    :param str mode: type of the value returned by the function ('dictionary' or 'dataframe').
    :return: Depending on the selected mode, returns a dictionary or dataframe with the module color of each experimental feature.
    """
    if mode == 'dataframe':
        features_per_module = dict(zip(data.columns, modColors))
        features_per_module = pd.DataFrame(list(features_per_module.items()), columns=['name', 'modColor'])
    elif mode == 'dictionary':
        features_per_module = defaultdict(list)
        for k, v in zip(modColors, data.columns):
            features_per_module[k].append(v)
        features_per_module = dict((k, v) for k, v in features_per_module.items())

    return features_per_module

def get_ModuleFeatures(data, modColors, modules=[]):
    """
    Groups and returns a list of the experimental features clustered in specific co-expression modules.

    :param data: pandas dataframe containing experimental data, with samples/subjects as rows and features as columns.
    :param ndarray modColors: array (numeric, character or a factor) attributing module colors to each feature in the experimental dataframe.
    :param list modules: list of module colors of interest.
    :return: List of lists with the experimental features in each selected module.
    """
    allfeatures = get_FeaturesPerModule(data, modColors, mode='dictionary')
    selectfeatures = [allfeatures[x] for x in modules]

    return selectfeatures

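# Illustrative note, not part of the original module: for a dataframe with columns
# ['f1', 'f2', 'f3'] and modColors = ['blue', 'blue', 'turquoise'],
# get_FeaturesPerModule(df, modColors, mode='dictionary') returns
# {'blue': ['f1', 'f2'], 'turquoise': ['f3']}, mode='dataframe' returns a two-column dataframe
# with 'name' and 'modColor', and get_ModuleFeatures(df, modColors, modules=['blue'])
# returns [['f1', 'f2']].
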
def get_EigengenesTrait_correlation(MEs, data):
    """
    Eigengenes are used as representative profiles of the co-expression modules, and the correlation between them is used to quantify module similarity. Clinical traits are added to the eigengenes to see how the traits fit into the eigengene network.

    :param MEs: pandas dataframe with module eigengenes.
    :param data: pandas dataframe containing clinical data, with samples/subjects as rows and clinical traits as columns.
    :return: Tuple with two pandas dataframes, one with the dissimilarity of the recalculated module eigengenes plus traits, and another with the overall correlations.
    """
    METDiss = pd.DataFrame()
    METcor = 0
    df_traits_r = data.copy()
    df_traits_r.columns = df_traits_r.columns.str.replace(' ', 'space')
    df_traits_r.columns = df_traits_r.columns.str.replace('(', 'parentheses1')
    df_traits_r.columns = df_traits_r.columns.str.replace(')', 'parentheses2')
    df_traits_r.columns = df_traits_r.columns.str.replace('/', 'slash')

    common = list(set(MEs.index).intersection(df_traits_r.index))
    if len(common) > 0:
        MET = WGCNA.orderMEs(base.cbind(MEs.loc[common, :], df_traits_r.loc[common, :]), verbose=0)
        METcor = WGCNA.cor(MET, use='p', verbose=0)
        METcor = pd.DataFrame(METcor, MET.columns, MET.columns)
        METcor.columns = METcor.columns.str.replace('space', ' ')
        METcor.columns = METcor.columns.str.replace('parentheses1', '(')
        METcor.columns = METcor.columns.str.replace('parentheses2', ')')
        METcor.columns = METcor.columns.str.replace('slash', '/')
        METcor.index = METcor.index.str.replace('space', ' ')
        METcor.index = METcor.index.str.replace('parentheses1', '(')
        METcor.index = METcor.index.str.replace('parentheses2', ')')
        METcor.index = METcor.index.str.replace('slash', '/')
        METDiss = 1 - METcor

    return METDiss, METcor

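# End-to-end usage sketch, not part of the original module. It chains the functions above in the
# order suggested by their docstrings; `data` is assumed to be the dictionary of processed
# datasets expected by get_data, with a 'proteomics' key for the experimental dataframe, and the
# module eigengenes are assumed to come back as a pandas dataframe indexed by sample.
def _example_wgcna_pipeline(data):
    wgcna_data = get_data(data, sd_cutoff=0.5)
    df_exp, df_cli = wgcna_data['proteomics'], wgcna_data['clinical']

    # Network construction and module detection
    softPower = pick_softThreshold(df_exp)
    dissTOM, moduleColors = build_network(df_exp, softPower=softPower)

    # Module eigengenes and module-trait relationships
    MEs, MEDiss = calculate_module_eigengenes(df_exp, moduleColors, softPower=softPower)
    moduleTraitCor, textMatrix = calculate_ModuleTrait_correlation(df_exp, df_cli, MEs)
    MM, MMPvalue = calculate_ModuleMembership(df_exp, MEs)
    FS, FSPvalue = calculate_FeatureTraitSignificance(df_exp, df_cli)
    METDiss, METcor = get_EigengenesTrait_correlation(MEs, df_cli)

    # Features grouped by module color
    features_per_module = get_FeaturesPerModule(df_exp, moduleColors, mode='dictionary')

    return moduleTraitCor, features_per_module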