Source code for ckg.graphdb_builder.experiments.parsers.clinicalParser

import os
import re
import pandas as pd
import numpy as np
from ckg import ckg_utils
from ckg.graphdb_builder import builder_utils


def parser(projectId, type='clinical'):
    """
    Parse one category of a project's files into a dictionary of results.

    The experiments root directory is read from the CKG configuration and the
    ``project``, ``clinical`` and ``experimental_design`` sub-directories of
    the project are derived from it. Only the parser selected by ``type`` runs.

    :param str projectId: project identifier, substituted for ``PROJECTID``
        in the directory templates.
    :param str type: 'project', 'experimental_design' or 'clinical'; any
        other value leaves the result empty.
    :return: dictionary with the entries produced by the selected parser.
    """
    root = ckg_utils.read_ckg_config(key='experiments_directory')
    # The placeholder is substituted on the already-joined path.
    project_directory = os.path.join(root, 'PROJECTID/project/').replace('PROJECTID', projectId)
    clinical_directory = os.path.join(root, 'PROJECTID/clinical/').replace('PROJECTID', projectId)
    design_directory = os.path.join(root, 'PROJECTID/experimental_design/').replace('PROJECTID', projectId)
    config = builder_utils.get_config(config_name="clinical.yml", data_type='experiments')

    data = {}
    if type == 'project':
        data.update(project_parser(projectId, config, project_directory))
    elif type == 'experimental_design':
        data.update(experimental_design_parser(projectId, config, design_directory))
    elif type == 'clinical':
        data.update(clinical_parser(projectId, config, clinical_directory))
    return data
def project_parser(projectId, config, directory):
    """
    Parse the project dataset and derive the project-level tables.

    :param str projectId: project identifier.
    :param dict config: parser configuration; ``config['separator']`` is
        passed to every relationship extractor.
    :param str directory: directory searched for the project dataset.
    :return: dictionary keyed by ``(name, 'w')`` tuples; empty when no
        project dataset was found.
    """
    project_data = parse_dataset(projectId, config, directory, key='project')
    if project_data is None:
        return {}

    data = {('info', 'w'): extract_project_info(project_data)}
    separator = config['separator']
    extractors = (
        ('responsibles', extract_responsible_rels),
        ('participants', extract_participant_rels),
        ('studies_tissue', extract_project_tissue_rels),
        ('studies_disease', extract_project_disease_rels),
        ('studies_intervention', extract_project_intervention_rels),
        ('follows_up_project', extract_project_rels),
        ('timepoint', extract_timepoints),
    )
    for name, extract in extractors:
        data[(name, 'w')] = extract(project_data, separator=separator)
    return data
def experimental_design_parser(projectId, config, directory):
    """
    Parse the experimental design dataset and derive its tables.

    :param str projectId: project identifier.
    :param dict config: parser configuration handed to ``parse_dataset``.
    :param str directory: directory searched for the design dataset.
    :return: dictionary keyed by ``(name, 'w')`` tuples; empty when no
        design dataset was found.
    """
    design_data = parse_dataset(projectId, config, directory, key='design')
    if design_data is None:
        return {}

    return {
        ('project', 'w'): extract_project_subject_rels(projectId, design_data),
        ('subjects', 'w'): extract_subject_identifiers(design_data),
        ('biological_samples', 'w'): extract_biosample_identifiers(design_data),
        ('analytical_samples', 'w'): extract_analytical_sample_identifiers(design_data),
        ('analytical_samples_info', 'w'): extract_analytical_samples_info(design_data),
        ('subject_biosample', 'w'): extract_biological_sample_subject_rels(design_data),
        ('biosample_analytical', 'w'): extract_biological_sample_analytical_sample_rels(design_data),
    }
def clinical_parser(projectId, config, clinical_directory):
    """
    Parse the clinical dataset and derive the sample/subject tables.

    :param str projectId: project identifier.
    :param dict config: parser configuration; ``config['separator']`` is
        used for the disease and intervention relationships.
    :param str clinical_directory: directory searched for the clinical dataset.
    :return: dictionary keyed by ``(name, 'w')`` tuples; empty when no
        clinical dataset was found.
    """
    clinical_data = parse_dataset(projectId, config, clinical_directory, key='clinical')
    if clinical_data is None:
        return {}

    data = {}
    data[('biosamples_info', 'w')] = extract_biological_samples_info(clinical_data)
    data[('biosample_analytical_attributes', 'w')] = extract_biosample_analytical_sample_relationship_attributes(clinical_data)
    data[('biological_sample_at_timepoint', 'w')] = extract_biological_sample_timepoint_rels(clinical_data)
    data[('biosample_tissue', 'w')] = extract_biological_sample_tissue_rels(clinical_data)
    data[('disease', 'w')] = extract_subject_disease_rels(clinical_data, separator=config['separator'])
    data[('subject_had_intervention', 'w')] = extract_subject_intervention_rels(clinical_data, separator=config['separator'])
    state_rels, quant_rels = extract_biological_sample_clinical_variables_rels(clinical_data)
    data[('clinical_state', 'w')] = state_rels
    data[('clinical_quant', 'w')] = quant_rels
    return data
def parse_dataset(projectId, configuration, dataDir, key='project'):
    """
    Locate and read the dataset file of the requested kind for a project.

    The expected file name comes from ``configuration['file_<key>']`` with
    ``PROJECTID`` replaced by ``projectId``. Regular files in ``dataDir`` whose
    name matches that value followed by at least one more character are
    candidates.

    :param str projectId: project identifier.
    :param dict configuration: parser configuration.
    :param str dataDir: directory that is searched for the file.
    :param str key: dataset kind; selects the ``file_<key>`` configuration entry.
    :return: the result of ``builder_utils.readDataset`` for the selected file
        (presumably a pandas DataFrame -- confirm against builder_utils), or
        None when the configuration entry, the directory or a matching file
        is missing.
    """
    config_key = 'file_' + key
    if config_key not in configuration:
        return None
    # A project without this directory has no dataset of this kind; report
    # that the same way as "no matching file" instead of raising.
    if not os.path.isdir(dataDir):
        return None

    data_file = configuration[config_key].replace('PROJECTID', projectId)
    # NOTE: the configured name is interpreted as a regular expression
    # (unescaped), which keeps existing configurations working unchanged.
    pattern = re.compile(r"{}.+".format(data_file))
    candidates = sorted(
        name for name in os.listdir(dataDir)
        if pattern.match(name) and os.path.isfile(os.path.join(dataDir, name))
    )
    if not candidates:
        return None
    # os.listdir order is arbitrary; sorting makes the choice reproducible
    # when several files match.
    return builder_utils.readDataset(os.path.join(dataDir, candidates[-1]))
def extract_project_info(project_data):
    """
    Return a copy of the project table with the standard column names.

    :param project_data: pandas DataFrame with exactly 16 columns, in the
        order of the names assigned below.
    :return: copy of ``project_data`` with renamed columns.
    :raises Exception: when the number of columns is not 16.
    """
    expected = ['internal_id', 'name', 'acronym', 'description', 'related_to',
                'datatypes', 'timepoints', 'disease', 'tissue', 'intervention',
                'responsible', 'participant', 'start_date', 'end_date',
                'status', 'external_id']
    provided = len(project_data.columns)
    if provided != len(expected):
        raise Exception("Project data requires {} columns, {} provided.\n Provide the following columns: {}".format(len(expected), provided, ",".join(expected)))

    info = project_data.copy()
    info.columns = expected
    return info
def extract_responsible_rels(project_data, separator='|'):
    """
    Build IS_RESPONSIBLE relationships from the project's 'responsible' field.

    Only the first row of ``project_data`` is read, as in the other project
    relationship extractors; previously additional rows leaked into the
    result as extra numeric columns.

    :param project_data: pandas DataFrame with 'responsible' and
        'external_id' columns.
    :param str separator: delimiter between names in the 'responsible' value.
    :return: DataFrame with columns START_ID (name), END_ID (project
        external id) and TYPE; empty when the column is absent or its first
        value is missing.
    """
    df = pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])
    if 'responsible' not in project_data:
        return df
    responsibles = project_data['responsible'][0]
    if pd.isna(responsibles):
        return df

    df = pd.DataFrame(responsibles.split(separator), columns=['START_ID'])
    df['END_ID'] = project_data['external_id'][0]
    df['TYPE'] = 'IS_RESPONSIBLE'
    return df
def extract_participant_rels(project_data, separator='|'):
    """
    Build PARTICIPATES_IN relationships from the project's 'participant' field.

    Only the first row of ``project_data`` is read, as in the other project
    relationship extractors; previously additional rows leaked into the
    result as extra numeric columns.

    :param project_data: pandas DataFrame with 'participant' and
        'external_id' columns.
    :param str separator: delimiter between names in the 'participant' value.
    :return: DataFrame with columns START_ID (name), END_ID (project
        external id) and TYPE; empty when the column is absent or its first
        value is missing.
    """
    df = pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])
    if 'participant' not in project_data:
        return df
    participants = project_data['participant'][0]
    if pd.isna(participants):
        return df

    df = pd.DataFrame(participants.split(separator), columns=['START_ID'])
    df['END_ID'] = project_data['external_id'][0]
    df['TYPE'] = 'PARTICIPATES_IN'
    return df
def extract_project_tissue_rels(project_data, separator='|'):
    """
    Build STUDIES_TISSUE relationships from the first row's 'tissue' value.

    :param project_data: pandas DataFrame with 'tissue' and 'external_id' columns.
    :param str separator: delimiter between tissue identifiers.
    :return: DataFrame with columns START_ID (project external id), END_ID
        (tissue) and TYPE; empty when the column is absent or its first
        value is missing.
    """
    if 'tissue' not in project_data or pd.isna(project_data['tissue'][0]):
        return pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])

    tissue_ids = project_data['tissue'][0].split(separator)
    project_id = project_data['external_id'][0]
    return pd.DataFrame({'START_ID': project_id,
                         'END_ID': tissue_ids,
                         'TYPE': 'STUDIES_TISSUE'})
def extract_project_disease_rels(project_data, separator='|'):
    """
    Build STUDIES_DISEASE relationships from the first row's 'disease' value.

    :param project_data: pandas DataFrame with 'disease' and 'external_id' columns.
    :param str separator: delimiter between disease identifiers.
    :return: DataFrame with columns START_ID (project external id), END_ID
        (disease) and TYPE; empty when the column is absent or its first
        value is missing.
    """
    if 'disease' not in project_data or pd.isna(project_data['disease'][0]):
        return pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])

    disease_ids = project_data['disease'][0].split(separator)
    project_id = project_data['external_id'][0]
    return pd.DataFrame({'START_ID': project_id,
                         'END_ID': disease_ids,
                         'TYPE': 'STUDIES_DISEASE'})
def extract_project_intervention_rels(project_data, separator='|'):
    """
    Build STUDIES_INTERVENTION relationships from the first row's 'intervention' value.

    Each entry is expected to end with an identifier in parentheses; the text
    after the opening parenthesis of the last whitespace-separated token is
    used as END_ID. Blank entries (e.g. from a trailing separator) and
    entries without such an identifier are skipped instead of raising.

    :param project_data: pandas DataFrame with 'intervention' and
        'external_id' columns.
    :param str separator: delimiter between intervention entries.
    :return: DataFrame with columns START_ID (project external id), END_ID
        (intervention id) and TYPE; empty when the column is absent, its
        first value is missing or no entry carries an identifier.
    """
    df = pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])
    if 'intervention' not in project_data:
        return df
    raw = project_data['intervention'][0]
    if pd.isna(raw):
        return df

    id_pattern = re.compile(r'\(([^)]+)')
    ids = []
    for entry in raw.split(separator):
        tokens = entry.split()
        # Only the last token is inspected, matching the "<label> (<id>)" layout.
        match = id_pattern.search(tokens[-1]) if tokens else None
        if match is not None:
            ids.append(match.group(1))
    if not ids:
        return df

    df = pd.DataFrame(ids, columns=['END_ID'])
    df.insert(loc=0, column='START_ID', value=project_data['external_id'][0])
    df['TYPE'] = 'STUDIES_INTERVENTION'
    return df
def extract_project_rels(project_data, separator='|'):
    """
    Build FOLLOWS_UP_PROJECT relationships from the first row's 'related_to' value.

    :param project_data: pandas DataFrame with 'related_to' and
        'external_id' columns.
    :param str separator: delimiter between related project identifiers.
    :return: DataFrame with columns START_ID (project external id), END_ID
        (related project) and TYPE; empty when the column is absent or its
        first value is missing.
    """
    if 'related_to' not in project_data or pd.isna(project_data['related_to'][0]):
        return pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])

    related_ids = project_data['related_to'][0].split(separator)
    project_id = project_data['external_id'][0]
    return pd.DataFrame({'START_ID': project_id,
                         'END_ID': related_ids,
                         'TYPE': 'FOLLOWS_UP_PROJECT'})
def extract_timepoints(project_data, separator='|'):
    """
    Split the first row's 'timepoints' value into value/unit pairs.

    Spaces are removed, the string is split on ``separator`` and each entry
    is matched as digits/minus signs followed by letters.

    :param project_data: pandas DataFrame with a 'timepoints' column.
    :param str separator: delimiter between timepoint entries.
    :return: DataFrame with columns ID (numeric part, as text), units and
        type ('Timepoint'); empty when the column is absent or its first
        value is missing.
    """
    if 'timepoints' not in project_data:
        return pd.DataFrame(columns=['ID', 'units', 'type'])
    raw = project_data['timepoints'][0]
    if pd.isna(raw):
        return pd.DataFrame(columns=['ID', 'units', 'type'])

    entries = pd.Series(raw.replace(' ', '').split(separator))
    timepoints = entries.str.extract(r'([\-\d]+)([a-zA-Z]+)', expand=True)
    timepoints.columns = ['ID', 'units']
    timepoints['type'] = 'Timepoint'
    return timepoints
def extract_project_subject_rels(projectId, design_data):
    """
    Build HAS_ENROLLED relationships from the project to each distinct subject.

    :param projectId: value used as START_ID of every relationship.
    :param design_data: pandas DataFrame with a 'subject id' column.
    :return: DataFrame with columns START_ID, END_ID (unique subject ids)
        and TYPE; empty when the column is absent or contains any missing
        value.
    """
    rels = pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])
    has_ids = 'subject id' in design_data
    if has_ids and not pd.isna(design_data['subject id']).any():
        subjects = design_data['subject id'].dropna().unique()
        rels = pd.DataFrame(subjects, columns=['END_ID'])
        rels.insert(loc=0, column='START_ID', value=projectId)
        rels['TYPE'] = 'HAS_ENROLLED'
    return rels
def extract_subject_identifiers(design_data):
    """
    Return the distinct (subject id, subject external id) pairs.

    :param design_data: pandas DataFrame with 'subject id' and
        'subject external_id' columns (a KeyError is raised otherwise).
    :return: DataFrame with columns ID and external_id; empty when any
        'subject external_id' value is missing.
    """
    id_col, external_col = 'subject id', 'subject external_id'
    pairs = design_data[[id_col, external_col]]
    if pd.isna(pairs[external_col]).any():
        return pd.DataFrame(columns=['ID', 'external_id'])

    identifiers = pairs.dropna(subset=[external_col])
    identifiers = identifiers.drop_duplicates(keep='first').reset_index(drop=True)
    identifiers.columns = ['ID', 'external_id']
    return identifiers
def extract_biosample_identifiers(design_data):
    """
    Return the distinct (biological sample id, external id) pairs.

    :param design_data: pandas DataFrame with 'biological_sample id' and
        'biological_sample external_id' columns (a KeyError is raised otherwise).
    :return: DataFrame with columns ID and external_id; empty when any
        'biological_sample external_id' value is missing.
    """
    id_col, external_col = 'biological_sample id', 'biological_sample external_id'
    pairs = design_data[[id_col, external_col]]
    if pd.isna(pairs[external_col]).any():
        return pd.DataFrame(columns=['ID', 'external_id'])

    identifiers = pairs.dropna(subset=[external_col])
    identifiers = identifiers.drop_duplicates(keep='first').reset_index(drop=True)
    identifiers.columns = ['ID', 'external_id']
    return identifiers
def extract_analytical_sample_identifiers(design_data):
    """
    Return the distinct (analytical sample id, external id) pairs.

    :param design_data: pandas DataFrame with 'analytical_sample id' and
        'analytical_sample external_id' columns (a KeyError is raised otherwise).
    :return: DataFrame with columns ID and external_id; empty when any
        'analytical_sample external_id' value is missing.
    """
    id_col, external_col = 'analytical_sample id', 'analytical_sample external_id'
    pairs = design_data[[id_col, external_col]]
    if pd.isna(pairs[external_col]).any():
        return pd.DataFrame(columns=['ID', 'external_id'])

    identifiers = pairs.dropna(subset=[external_col])
    identifiers = identifiers.drop_duplicates(keep='first').reset_index(drop=True)
    identifiers.columns = ['ID', 'external_id']
    return identifiers
def extract_biological_sample_subject_rels(design_data):
    """
    Build BELONGS_TO_SUBJECT relationships from biological samples to subjects.

    :param design_data: pandas DataFrame with 'biological_sample id' and
        'subject id' columns.
    :return: DataFrame with columns START_ID (biological sample), END_ID
        (subject) and TYPE, one row per distinct pair; empty when
        'biological_sample id' is absent or contains any missing value.
    """
    sample_col = 'biological_sample id'
    if sample_col not in design_data or pd.isna(design_data[sample_col]).any():
        return pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])

    pairs = design_data[[sample_col, 'subject id']].drop_duplicates(keep='first')
    rels = pairs.reset_index(drop=True)
    rels.columns = ['START_ID', 'END_ID']
    rels['TYPE'] = 'BELONGS_TO_SUBJECT'
    return rels
def extract_biological_sample_analytical_sample_rels(design_data):
    """
    Build SPLITTED_INTO relationships from biological to analytical samples.

    :param design_data: pandas DataFrame with 'biological_sample id',
        'analytical_sample id' and 'analytical_sample external_id' columns.
    :return: DataFrame with columns START_ID (biological sample), END_ID
        (analytical sample) and TYPE, one row per distinct pair; empty when
        'analytical_sample id' is absent or any
        'analytical_sample external_id' value is missing.
    """
    rels = pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])
    if 'analytical_sample id' not in design_data:
        return rels
    # NOTE(review): presence is tested on 'analytical_sample id' but missing
    # values on 'analytical_sample external_id' -- kept as is; confirm intended.
    if pd.isna(design_data['analytical_sample external_id']).any():
        return rels

    pairs = design_data[['biological_sample id', 'analytical_sample id']]
    rels = pairs.drop_duplicates(keep='first').reset_index(drop=True)
    rels.columns = ['START_ID', 'END_ID']
    rels['TYPE'] = 'SPLITTED_INTO'
    return rels
def extract_biological_samples_info(clinical_data):
    """
    Collect the attributes of the biological samples.

    All columns whose name starts with 'biological_sample' are kept, the
    'biological_sample ' prefix is removed from their names and
    'external_id' becomes 'ID'. Duplicate rows are dropped.

    :param clinical_data: pandas DataFrame with the clinical table.
    :return: DataFrame with one row per distinct attribute combination;
        a frame with only an 'ID' column and no rows when
        'biological_sample external_id' is absent or contains any missing value.
    """
    key = 'biological_sample external_id'
    if key not in clinical_data or pd.isna(clinical_data[key]).any():
        return pd.DataFrame(columns=['ID'])

    sample_cols = [name for name in clinical_data.columns
                   if str(name).startswith('biological_sample')]
    info = clinical_data[sample_cols].copy()
    info.columns = [name.replace('biological_sample ', '') for name in sample_cols]
    info = info.rename(columns={'external_id': 'ID'})
    return info.drop_duplicates(keep='first').reset_index(drop=True)
def extract_analytical_samples_info(data):
    """
    Return the design table with analytical-sample columns renamed.

    The 'analytical_sample ' prefix is removed from every column name, then
    'external_id', 'grouping1' and 'grouping2' become 'ID', 'group' and
    'secondary_group'. A 'batch' column filled with None is added when missing.

    :param data: pandas DataFrame with the experimental design table.
    :return: renamed copy of ``data``; an empty frame with columns ID, group,
        secondary_group and batch when 'analytical_sample external_id' is
        absent or contains any missing value.
    """
    key = 'analytical_sample external_id'
    if key not in data or pd.isna(data[key]).any():
        return pd.DataFrame(columns=['ID', 'group', 'secondary_group', 'batch'])

    final_names = {'external_id': 'ID', 'grouping1': 'group', 'grouping2': 'secondary_group'}
    info = data.copy()
    stripped = [name.replace('analytical_sample ', '') for name in info.columns]
    info.columns = [final_names.get(name, name) for name in stripped]
    if 'batch' not in info:
        info['batch'] = None
    return info
def extract_biosample_analytical_sample_relationship_attributes(clinical_data):
    """
    Collect the attributes of the biological-to-analytical sample relationship.

    :param clinical_data: pandas DataFrame with
        'biological_sample external_id' and 'analytical_sample external_id'
        columns and, optionally, 'analytical_sample quantity' and
        'analytical_sample quantity_units'.
    :return: DataFrame with columns START_ID, END_ID and, when the optional
        columns are present, quantity and quantity_units (distinct rows);
        an empty four-column frame when 'analytical_sample external_id' is
        absent or contains any missing value.
    """
    key = 'analytical_sample external_id'
    if key not in clinical_data or pd.isna(clinical_data[key]).any():
        return pd.DataFrame(columns=['START_ID', 'END_ID', 'quantity', 'quantity_units'])

    source_cols = ['biological_sample external_id', 'analytical_sample external_id']
    target_cols = ['START_ID', 'END_ID']
    optional = (('analytical_sample quantity', 'quantity'),
                ('analytical_sample quantity_units', 'quantity_units'))
    for source, target in optional:
        if source in clinical_data:
            source_cols.append(source)
            target_cols.append(target)

    attributes = clinical_data[source_cols].drop_duplicates(keep='first').reset_index(drop=True)
    attributes.columns = target_cols
    return attributes
def extract_biological_sample_timepoint_rels(clinical_data):
    """
    Build SAMPLE_AT_TIMEPOINT relationships from biological samples to timepoints.

    :param clinical_data: pandas DataFrame with 'biological_sample external_id',
        'timepoint', 'timepoint units' and 'intervention id' columns.
    :return: DataFrame with columns START_ID, END_ID (timepoint), TYPE,
        timepoint_units and intervention, one row per distinct combination;
        empty when 'timepoint' is absent or entirely missing.
    """
    rels = pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE', 'timepoint_units', 'intervention'])
    if 'timepoint' not in clinical_data:
        return rels
    if pd.isna(clinical_data['timepoint']).all():
        return rels

    wanted = ['biological_sample external_id', 'timepoint', 'timepoint units', 'intervention id']
    rels = clinical_data[wanted].drop_duplicates(keep='first').reset_index(drop=True)
    # Missing ids go through 0 so the column can be cast to integer text;
    # every '0' is then turned back into a missing value.
    as_int = rels['intervention id'].replace(np.nan, 0).astype('int64')
    rels['intervention id'] = as_int.astype('str').replace('0', np.nan)
    rels.columns = ['START_ID', 'END_ID', 'timepoint_units', 'intervention']
    rels.insert(loc=2, column='TYPE', value='SAMPLE_AT_TIMEPOINT')
    return rels
def extract_biological_sample_tissue_rels(clinical_data):
    """
    Build FROM_TISSUE relationships from biological samples to tissues.

    :param clinical_data: pandas DataFrame with 'biological_sample external_id'
        and 'tissue id' columns.
    :return: DataFrame with columns START_ID (biological sample), END_ID
        (tissue) and TYPE, one row per distinct pair; empty when 'tissue id'
        is absent or entirely missing.
    """
    if 'tissue id' not in clinical_data or pd.isna(clinical_data['tissue id']).all():
        return pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])

    pairs = clinical_data[['biological_sample external_id', 'tissue id']]
    rels = pairs.drop_duplicates(keep='first').reset_index(drop=True)
    rels.columns = ['START_ID', 'END_ID']
    rels['TYPE'] = 'FROM_TISSUE'
    return rels
def extract_subject_disease_rels(clinical_data, separator='|'):
    """
    Build HAS_DISEASE relationships from subjects to diseases.

    Each 'disease id' value is converted to text and split on ``separator``;
    every resulting identifier is paired with the row's
    'subject external_id'. Missing values and duplicate pairs are dropped.

    ``clinical_data`` is left untouched: the string conversion is done on a
    local copy of the column rather than written back into the caller's frame.

    :param clinical_data: pandas DataFrame with 'subject external_id' and
        'disease id' columns.
    :param str separator: delimiter between disease identifiers.
    :return: DataFrame with columns START_ID (subject), END_ID (disease) and
        TYPE; empty when 'disease id' is absent or entirely missing.
    """
    df = pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE'])
    if 'disease id' not in clinical_data:
        return df
    if pd.isna(clinical_data['disease id']).all():
        return df

    # astype(str) turns missing values into the text 'nan'; they are mapped
    # back to NaN and dropped below.
    disease_ids = clinical_data['disease id'].astype(str)
    per_subject = pd.DataFrame(disease_ids.str.split(separator).tolist(),
                               index=clinical_data['subject external_id'])
    df = per_subject.stack()
    df = df.reset_index([0, 'subject external_id']).replace('nan', np.nan).dropna().drop_duplicates(keep='first')
    df.columns = ['START_ID', 'END_ID']
    df['TYPE'] = 'HAS_DISEASE'
    return df
def extract_subject_intervention_rels(clinical_data, separator='|'):
    """
    Build HAD_INTERVENTION relationships from subjects to interventions.

    The 'had_intervention', 'had_intervention_type',
    'had_intervention_in_combination' and 'had_intervention_response' values
    of each row are converted to text, split on ``separator`` and stripped.
    The n-th entries of the four fields of a row form one relationship;
    entries without a counterpart in all four fields are dropped.

    The fields are aligned on (row position, entry position), which is always
    unique. Aligning on the subject id alone breaks as soon as a subject has
    several interventions or appears in several rows.

    :param clinical_data: pandas DataFrame with 'subject external_id' and the
        four 'had_intervention*' columns.
    :param str separator: delimiter between entries of one field.
    :return: DataFrame with columns START_ID (subject), END_ID
        (intervention), TYPE, type, in_combination and response; an empty
        frame when 'had_intervention' is absent or entirely missing.
    """
    df = pd.DataFrame(columns=['START_ID', 'END_ID', 'TYPE', 'in_combination', 'response'])
    if 'had_intervention' not in clinical_data:
        return df
    if pd.isna(clinical_data['had_intervention']).all():
        return df

    subjects = clinical_data['subject external_id'].values

    def _split(column):
        # One value per (row position, entry position).
        # NOTE: astype(str) renders missing values as the text 'nan'.
        values = clinical_data[column].reset_index(drop=True).astype(str)
        return values.str.split(separator, expand=True).stack().dropna().str.strip()

    fields = ('had_intervention', 'had_intervention_type',
              'had_intervention_in_combination', 'had_intervention_response')
    merged = pd.concat([_split(field) for field in fields], axis=1, join='inner')
    merged.columns = ['END_ID', 'type', 'in_combination', 'response']

    row_positions = merged.index.get_level_values(0).values
    df = merged.reset_index(drop=True)
    df.insert(loc=0, column='START_ID', value=subjects[row_positions])
    df.insert(loc=2, column='TYPE', value='HAD_INTERVENTION')
    return df
def extract_biological_sample_clinical_variables_rels(clinical_data):
    """
    Build relationships from biological samples to clinical variables.

    The table is indexed by 'biological_sample external_id'. Each remaining
    column is named after the text found between parentheses in the last
    whitespace-separated token of its header. Numeric columns become
    HAS_QUANTIFIED_CLINICAL relationships, all others HAS_CLINICAL_STATE.
    Missing values and duplicate rows are dropped.

    Both returned frames always carry the columns START_ID, END_ID, TYPE and
    value, including when the table has no numeric or no non-numeric
    columns. Previously a column-less slice of the input was returned there.

    :param clinical_data: pandas DataFrame with the clinical table.
    :return: tuple ``(state_relationships, quantified_relationships)``.
    """
    rel_columns = ['START_ID', 'END_ID', 'TYPE', 'value']
    df_quant = pd.DataFrame(columns=rel_columns)
    df_state = pd.DataFrame(columns=rel_columns)
    if 'biological_sample external_id' not in clinical_data:
        return df_state, df_quant

    def _to_relationships(wide, rel_type):
        # One row per (sample, variable, value).
        rels = wide.stack().reset_index().drop_duplicates(keep='first').dropna()
        rels.columns = ['START_ID', 'END_ID', 'value']
        rels = rels.drop_duplicates()
        rels.insert(loc=2, column='TYPE', value=rel_type)
        return rels

    df = clinical_data.set_index('biological_sample external_id').copy()
    df.columns = [name.split()[-1] for name in df.columns]
    # Headers without a parenthesised code end up with a missing label.
    df.columns = df.columns.str.extract(r'.*\((.*)\).*')[0].tolist()

    # _get_numeric_data is kept on purpose: the public select_dtypes('number')
    # treats boolean columns differently.
    numeric = df._get_numeric_data()
    states = df.loc[:, ~df.columns.isin(numeric.columns.tolist())]
    if not numeric.empty:
        df_quant = _to_relationships(numeric, 'HAS_QUANTIFIED_CLINICAL')
    if not states.empty:
        df_state = _to_relationships(states, 'HAS_CLINICAL_STATE')
    return df_state, df_quant