Project notebook with R

  • Easy access to all the experimental data, statistical analysis results and visualization of reports using R

  • All the R functionality at hand

Import R libraries

Make sure that all the libraries you plan to use have been previously installed.

If not, you can install them through the notebook.

E.g. install.packages("tidyverse")

[10]:
library(rhdf5)
library(rjson)
library(foreach)
library(plotly)
library(tidyverse)
library(networkD3)

Define custom functions

Within the Jupyter notebook you can create any functions relevant to your work that are not implemented in the analytics core, such as the R functions below. Here we define R functions that allow us to load a project from the CKG database, access its datasets and plots, and visualize the plots in the notebook.

[11]:
#' Build an interactive force-directed graph from a report network
#'
#' Converts the node and link records of a parsed JSON network into the
#' zero-indexed node/edge tables required by `networkD3::forceNetwork()` and
#' returns the resulting widget.
#'
#' @param json_network A parsed network entry (e.g. an element of
#'   `report$nets`). It must contain `net_json$nodes`, whose records carry the
#'   fields `id` and `color`, and `net_json$links`, whose records carry the
#'   fields `source`, `target` and `width`.
#' @return The htmlwidget created by `networkD3::forceNetwork()`.
importGraph <- function(json_network) {
  # Extract one field from every record by name. Looking fields up by name
  # (instead of reshaping `unlist()` output into a matrix) keeps the columns
  # aligned even when some records lack a field or list fields in another
  # order; absent fields become NA.
  pluck_field <- function(records, key) {
    vapply(records, function(rec) {
      value <- if (key %in% names(rec)) unlist(rec[[key]]) else NULL
      if (length(value) == 0L) NA_character_ else as.character(value[1L])
    }, character(1), USE.NAMES = FALSE)
  }

  node_records <- json_network$net_json$nodes
  link_records <- json_network$net_json$links

  # forceNetwork addresses nodes by zero-based row position, so every node
  # gets a numeric `id` next to its original identifier (`label`).
  node_labels <- pluck_field(node_records, "id")
  nodes <- data.frame(
    id = seq_along(node_labels) - 1L,
    label = node_labels,
    color = pluck_field(node_records, "color"),
    stringsAsFactors = FALSE
  )

  # Translate the link endpoints from node identifiers to node positions.
  # Plain character vectors are matched, which avoids the factor-level
  # coercion warnings of a join on factor columns.
  # NOTE(review): assumes node identifiers are unique - confirm.
  edges <- data.frame(
    from = nodes$id[match(pluck_field(link_records, "source"), nodes$label,
                          incomparables = NA)],
    to = nodes$id[match(pluck_field(link_records, "target"), nodes$label,
                        incomparables = NA)],
    weight = as.numeric(pluck_field(link_records, "width")),
    stringsAsFactors = FALSE
  )

  # A link whose endpoint is not in the node table cannot be drawn.
  dangling <- is.na(edges$from) | is.na(edges$to)
  if (any(dangling)) {
    warning(sum(dangling), " link(s) reference unknown nodes and were dropped",
            call. = FALSE)
    edges <- edges[!dangling, , drop = FALSE]
    rownames(edges) <- NULL
  }

  # forceNetwork passes the `Value` column to JavaScript under the name
  # `value`, so the width callback has to read `d.value`.
  networkD3::forceNetwork(
    Links = edges, Nodes = nodes,
    height = 850, width = 800,
    Source = "from", Target = "to",
    Value = "weight", NodeID = "label",
    Group = "color", opacity = 0.8, opacityNoHover = 1,
    zoom = TRUE,
    linkWidth = networkD3::JS("function(d) { return Math.sqrt(d.value); }")
  )
}

#' Read the figures and networks stored in a report HDF5 file
#'
#' Walks every group of the file except the root and groups whose path
#' contains "Table". Within a group, entries whose name contains "figure" are
#' parsed as JSON and their `props$figure` element is collected under the key
#' "<group>_<entry name>"; entries whose name contains "net" are parsed as
#' JSON and collected under the entry name.
#'
#' @param report_file Path to the report HDF5 file.
#' @param report_name Name stored in the returned object.
#' @return An object of class "report": a list with the elements `name`,
#'   `plots` and `nets`. `plots` and `nets` are `NULL` when nothing was found
#'   or when the file does not exist (a warning is raised in that case).
read_report <- function(report_file, report_name) {
  plots <- c()
  nets <- c()
  if (file.exists(report_file)) {
    # Registered before any read so the HDF5 handles are released even when
    # reading or JSON parsing fails part-way through.
    on.exit(h5closeAll(), add = TRUE)
    content <- h5ls(report_file)
    for (group in unique(content$group)) {
      if (group == "/" || grepl('Table', group)) {
        next
      }
      # Drop the leading "/" of the group path.
      group <- substring(group, 2)
      report_figure <- h5read(report_file, group)
      for (name in unique(names(report_figure))) {
        if (grepl('figure', name)) {
          figure_json <- fromJSON(report_figure[[name]][1])
          plot <- list(plot = figure_json$props$figure)
          names(plot) <- paste(group, name, sep = "_")
          plots <- c(plots, plot)
        } else if (grepl('net', name)) {
          net <- list(net = fromJSON(report_figure[[name]][1]))
          names(net) <- name
          nets <- c(nets, net)
        }
      }
    }
  } else {
    warning("Report file not found: ", report_file, call. = FALSE)
  }
  report <- list(name = report_name, plots = plots, nets = nets)
  class(report) <- "report"
  report
}

# Convert a list of parsed JSON records into a data frame with one row per
# record. Values are placed by field name, so a record with missing or null
# fields yields NA cells instead of shifting every following value (which is
# what reshaping the flat `unlist()` output into a matrix does).
.records_to_data_frame <- function(records) {
  n_rows <- length(records)
  if (n_rows == 0L) {
    return(data.frame())
  }
  rows <- lapply(records, function(rec) {
    values <- unlist(rec)
    if (length(values) == 0L) {
      return(NULL)
    }
    keys <- names(values)
    if (is.null(keys)) {
      keys <- rep("", length(values))
    }
    # Unnamed values (e.g. records that are plain arrays) are keyed by position.
    unnamed <- is.na(keys) | !nzchar(keys)
    keys[unnamed] <- paste0("X", which(unnamed))
    names(values) <- make.unique(keys)
    values
  })
  keys <- as.character(unlist(lapply(rows, names), use.names = FALSE))
  columns <- unique(keys)
  cells <- matrix(NA, nrow = n_rows, ncol = length(columns))
  if (length(keys) > 0L) {
    row_index <- rep(seq_len(n_rows), lengths(rows))
    cells[cbind(row_index, match(keys, columns))] <-
      unlist(rows, use.names = FALSE)
  }
  df <- data.frame(cells)
  # Set afterwards so names such as "go annotation" are kept verbatim.
  colnames(df) <- columns
  df
}

#' Read the data frames stored in a dataset HDF5 file
#'
#' Walks every group of the file except the root. Each entry of a group that
#' holds a JSON payload (an element without names) is parsed and converted to
#' a data frame with one row per JSON record; the data frame is stored under
#' the entry name.
#'
#' @param dataset_file Path to the dataset HDF5 file.
#' @param dataset_name Name stored in the returned object.
#' @return An object of class "dataset": a list with the elements `name` and
#'   `data` (a named list of data frames, or `NULL` when nothing was read or
#'   the file does not exist; a warning is raised in the latter case).
read_dataset <- function(dataset_file, dataset_name) {
  dataframes <- c()
  if (file.exists(dataset_file)) {
    # Registered before any read so the HDF5 handles are released even when
    # reading or JSON parsing fails part-way through.
    on.exit(h5closeAll(), add = TRUE)
    content <- h5ls(dataset_file)
    for (group in unique(content$group)) {
      if (group == "/") {
        next
      }
      # Drop the leading "/" of the group path.
      group <- substring(group, 2)
      dataframe_json <- h5read(dataset_file, group)
      for (name in unique(names(dataframe_json))) {
        entry <- dataframe_json[[name]]
        # Only unnamed elements are parsed as JSON. Named elements are
        # skipped; previously they silently re-used the records parsed for
        # the preceding entry.
        if (!is.null(names(entry))) {
          next
        }
        dataframe <- list(data = .records_to_data_frame(fromJSON(entry[1])))
        names(dataframe) <- name
        dataframes <- c(dataframes, dataframe)
      }
    }
  } else {
    warning("Dataset file not found: ", dataset_file, call. = FALSE)
  }
  dataset <- list(name = dataset_name, data = dataframes)
  class(dataset) <- "dataset"
  dataset
}


#' Load the report and datasets of one data type of a project
#'
#' Reads `<reports_dir>/<project_id>/<dataset>/report.h5` with `read_report()`
#' and `<reports_dir>/<project_id>/<dataset>/<dataset>_dataset.h5` with
#' `read_dataset()`.
#'
#' @param project_id Project identifier; also the name of the project
#'   directory inside `reports_dir`.
#' @param dataset Data type to load (e.g. "proteomics"); also the name of the
#'   sub-directory inside the project directory.
#' @param reports_dir Directory that contains one sub-directory per project.
#'   Defaults to the location relative to this notebook.
#' @return A list with the elements `id`, `report` (the result of
#'   `read_report()`) and `datasets` (the result of `read_dataset()`).
load_project <- function(project_id, dataset,
                         reports_dir = "../../../data/reports") {
  dataset_dir <- file.path(reports_dir, project_id, dataset)
  report_file <- file.path(dataset_dir, "report.h5")
  datasets_file <- file.path(
    dataset_dir,
    paste(dataset, "dataset.h5", sep = '_')
  )
  report <- read_report(report_file, paste(project_id, "Report", dataset))
  datasets <- read_dataset(
    datasets_file,
    paste(project_id, "Dataset", dataset)
  )
  list(id = project_id, report = report, datasets = datasets)
}

Create a project object by loading an existing report

[12]:
# Load the report and datasets of the "proteomics" data type of project P0000001.
p <- load_project("P0000001", "proteomics")
Warning message in matrix(unlist(dataframe_str), nrow = length(dataframe_str), byrow = T):
“data length [56749] is not a sub-multiple or multiple of the number of rows [17298]”
Warning message in matrix(unlist(dataframe_str), nrow = length(dataframe_str), byrow = T):
“data length [942] is not a sub-multiple or multiple of the number of rows [188]”
Warning message in matrix(unlist(dataframe_str), nrow = length(dataframe_str), byrow = T):
“data length [140] is not a sub-multiple or multiple of the number of rows [8]”
Warning message in matrix(unlist(dataframe_str), nrow = length(dataframe_str), byrow = T):
“data length [152735] is not a sub-multiple or multiple of the number of rows [26135]”
Warning message in matrix(unlist(dataframe_str), nrow = length(dataframe_str), byrow = T):
“data length [152735] is not a sub-multiple or multiple of the number of rows [26135]”
Warning message in matrix(unlist(dataframe_str), nrow = length(dataframe_str), byrow = T):
“data length [19912] is not a sub-multiple or multiple of the number of rows [6187]”

Visualize the list of plots contained in the report

[13]:
# List the keys of all figures collected from the project report.
names(p$report$plots)
  1. '0~proteomics_pipeline~cytoscape_network_0_figure'
  2. '11~stratification_description~description_0_figure'
  3. '12~stratification_pca~pca_0_figure'
  4. '13~regulation_description~description_0_figure'
  5. '15~regulation_samr~volcanoplot_0_figure'
  6. '15~regulation_samr~volcanoplot_1_figure'
  7. '15~regulation_samr~volcanoplot_2_figure'
  8. '15~regulation_samr~volcanoplot_3_figure'
  9. '15~regulation_samr~volcanoplot_4_figure'
  10. '15~regulation_samr~volcanoplot_5_figure'
  11. '15~regulation_samr~volcanoplot_6_figure'
  12. '15~regulation_samr~volcanoplot_7_figure'
  13. '15~regulation_samr~volcanoplot_8_figure'
  14. '15~regulation_samr~volcanoplot_9_figure'
  15. '22~literature_associations_publications_abstracts~wordcloud_0_figure'
  16. '2~peptides~barplot_0_figure'
  17. '4~proteins~barplot_0_figure'
  18. '6~modifications~facetplot_0_figure'
  19. '8~coefficient_variation_coefficient_of_variation~scatterplot_matrix_0_figure'
  20. '9~ranking_ranking_with_markers~ranking_0_figure'

Access a specific plot and use plotly to visualize it

[18]:
# Pass the stored figure of the peptides bar plot to plotly for rendering.
plotly_build(p$report$plots$`2~peptides~barplot_0_figure`)

Networks are stored in JSON format and have to be converted to an edge list first. We use R’s networkD3 package to create a D3 JavaScript force-directed network graph from the edge list.

[15]:
# Convert the report network stored under "0_net" and display it.
importGraph(p$report$nets$`0_net`)
Warning message:
“Column `source`/`label` joining factors with different levels, coercing to character vector”
Warning message:
“Column `target`/`label` joining factors with different levels, coercing to character vector”

We can also easily access the different datasets of the project

[16]:
# List the names of all data frames read from the project dataset file.
names(p$datasets$data)
  1. 'complex_associations'
  2. 'correlation_correlation'
  3. 'disease_associations'
  4. 'drug_associations'
  5. 'go annotation'
  6. 'go_enrichment_Biological_processes_regulation_enrichment'
  7. 'interaction_network'
  8. 'literature_associations_publications_abstracts'
  9. 'number of modified proteins'
  10. 'number of peptides'
  11. 'number of proteins'
  12. 'original'
  13. 'overview statistics_summary'
  14. 'pathway annotation'
  15. 'pathway_enrichment_Pathways_regulation_enrichment'
  16. 'processed'
  17. 'protein biomarkers'
  18. 'regulated'
  19. 'regulation table'
  20. 'Data Matrix Shape'
  21. 'Stats'
[17]:
# Display the data frame stored under "correlation_correlation".
p$datasets$data$correlation_correlation
A data.frame: 74922 × 6
node1 node2 weight pvalue padj rejected
<fct> <fct> <fct> <fct> <fct> <fct>
A30~A2MYE2 A2M~P01023 0.2892365267 0.0 0.0 TRUE
ABI3BP~Q7Z7G0 A30~A2MYE2 -0.31295426430.0 0.0 TRUE
ACE~P12821 A2M~P01023 0.2904851136 0.000590520.00164552TRUE
ACE~P12821 ABI3BP~Q7Z7G00.0026333708 3e-08 2e-07 TRUE
ACTB~P60709 A2M~P01023 -0.15032529931.4e-07 8.5e-07 TRUE
ACTB~P60709 ABI3BP~Q7Z7G0-0.12135720780.009595610.01965832TRUE
ACTB~P60709 ACE~P12821 -0.01205690510.001728830.00428982TRUE
ACTN1~P12814 A30~A2MYE2 -0.09531858744.137e-05 0.00014992TRUE
ACTN1~P12814 ABI3BP~Q7Z7G00.2159122385 0.0 0.0 TRUE
ACTN1~P12814 ACE~P12821 0.0337919091 0.001460440.00369332TRUE
ACTN1~P12814 ACTB~P60709 0.2556916718 2.296e-05 8.778e-05 TRUE
ADA2~Q9NZK5 A2M~P01023 0.2405633762 0.0 1e-08 TRUE
ADA2~Q9NZK5 ABI3BP~Q7Z7G0-0.13787339980.0068122 0.01452743TRUE
ADA2~Q9NZK5 ACTB~P60709 0.0488541846 0.002289050.00551185TRUE
ADA2~Q9NZK5 ACTN1~P12814 0.1606489207 9.2e-07 4.71e-06 TRUE
ADAMTS13~Q76LX8A2M~P01023 0.228562439 1.3e-07 7.7e-07 TRUE
ADAMTS13~Q76LX8ACE~P12821 0.1100432326 9.62e-06 3.996e-05 TRUE
ADAMTS13~Q76LX8ACTB~P60709 -0.12984914450.0 0.0 TRUE
ADAMTS13~Q76LX8ADA2~Q9NZK5 0.0882539027 0.0 0.0 TRUE
ADAMTSL4~Q6UY14A2M~P01023 -0.01671110690.0258627 0.0469946 TRUE
ADAMTSL4~Q6UY14ABI3BP~Q7Z7G0-0.02978281792.1e-07 1.21e-06 TRUE
ADAMTSL4~Q6UY14ACE~P12821 0.1867241047 2e-08 1.1e-07 TRUE
ADAMTSL4~Q6UY14ACTB~P60709 0.0422307278 0.0 0.0 TRUE
ADAMTSL4~Q6UY14ACTN1~P12814 0.167608862 0.0 0.0 TRUE
ADAMTSL4~Q6UY14ADA2~Q9NZK5 -0.04561400593.96e-06 1.779e-05 TRUE
ADH4~P08319 A2M~P01023 -0.11359717140.013337680.02626505TRUE
ADH4~P08319 A30~A2MYE2 0.0504134162 0.0 0.0 TRUE
ADH4~P08319 ACE~P12821 0.065413239 0.0 0.0 TRUE
ADH4~P08319 ACTB~P60709 -0.08051200540.0 0.0 TRUE
ADH4~P08319 ACTN1~P12814 0.0217710049 0.0 0.0 TRUE
scFv~Q65ZC9SERPINA5~P05154-0.113307701 1.005e-05 4.157e-05 TRUE
scFv~Q65ZC9SERPINA7~P055430.0029215059 0.006558860.01404733TRUE
scFv~Q65ZC9SERPINF1~P36955-0.03586426980.0 0.0 TRUE
scFv~Q65ZC9SERPING1~P05155-0.46450538260.011860570.02368813TRUE
scFv~Q65ZC9SHBG~P04278 0.0597719401 5.92e-06 2.568e-05 TRUE
scFv~Q65ZC9SNCA~P37840 0.0526730489 0.0 0.0 TRUE
scFv~Q65ZC9SOD3~P08294 0.065714844 0.001345670.00343099TRUE
scFv~Q65ZC9SPARCL1~Q14515 0.2050409349 4e-08 2.6e-07 TRUE
scFv~Q65ZC9SPINK5~Q9NQ38 0.0993925633 0.0 0.0 TRUE
scFv~Q65ZC9SPP2~Q13103 0.1951832721 0.000192870.00060287TRUE
scFv~Q65ZC9STMN1~P16949 -0.19164591180.0 0.0 TRUE
scFv~Q65ZC9THBS1~P07996 -0.08333917981.277e-05 5.16e-05 TRUE
scFv~Q65ZC9THBS4~P35443 -0.010593396 0.000431770.00124437TRUE
scFv~Q65ZC9TKT~P29401 -0.244463802 0.001388870.00353092TRUE
scFv~Q65ZC9TNC~A0A024R884 0.2552310889 0.0 0.0 TRUE
scFv~Q65ZC9TNN~Q9UQP3 -0.10756610310.002427620.00580852TRUE
scFv~Q65ZC9TNXB~P22105 0.2817641092 3.13e-05 0.00011634TRUE
scFv~Q65ZC9TPI1~P60174 -0.22982250460.0 1e-08 TRUE
scFv~Q65ZC9V1-13~Q5NV69 0.1054214299 0.003063980.00714884TRUE
scFv~Q65ZC9V2-13~Q5NV73 0.4771154135 2e-08 1.3e-07 TRUE
scFv~Q65ZC9V4-2~Q5NV82 0.2165834919 1.139e-05 4.654e-05 TRUE
scFv~Q65ZC9V5-2~A2MYC8 0.1808639979 0.0 0.0 TRUE
scFv~Q65ZC9V5-4~Q5NV79 -0.08533651766.394e-05 0.0002221 TRUE
scFv~Q65ZC9VASN~Q6EMK4 0.1403713766 4e-08 2.9e-07 TRUE
scFv~Q65ZC9VCAM1~P19320 -0.05427300410.009159230.01886712TRUE
scFv~Q65ZC9VH6DJ~A2N0T4 0.2625776727 8e-08 5e-07 TRUE
scFv~Q65ZC9VIM~P08670 0.0453630504 0.000129940.00042167TRUE
scFv~Q65ZC9VK3~A2N2F4 0.1745551542 0.022515280.04162447TRUE
scFv~Q65ZC9VNN1~O95497 -0.23294967120.011744170.02348694TRUE
scFv~Q65ZC9YWHAZ~P63104 0.2255080844 2.1e-07 1.21e-06 TRUE
[ ]: