# Source code for ckg.graphdb_builder.databases.parsers.intactParser

import os.path
import re
from collections import defaultdict
from ckg.graphdb_builder import builder_utils

#########################
#          IntAct       #
#########################
def parser(databases_directory, download=True):
    """Parse the IntAct interaction file into curated human interactions.

    Reads ``intactConfig.yml`` through ``builder_utils.get_config``, optionally
    downloads the file named by its ``intact_psimitab_url`` entry into
    ``<databases_directory>/Intact``, and aggregates the tab-separated rows per
    ``(interactor A, interactor B)`` pair. Only rows where both taxonomy ids
    are ``9606`` (human in NCBI Taxonomy) and whose score field is numeric are
    kept. The working directory is removed once parsing has finished.

    NOTE(review): the column indices used below (0, 1, 6, 8, 9, 10, 11, 12,
    14) presumably follow the PSI-MITAB layout -- confirm against the file
    actually served at ``intact_psimitab_url``.

    :param str databases_directory: directory under which the ``Intact``
        working directory is created.
    :param bool download: when True, download the file before parsing;
        otherwise it must already be present in the working directory.
    :return: tuple ``(relationships, header, outputfileName)`` where
        ``relationships`` is a set of tuples ``(interactor A, interactor B,
        "CURATED_INTERACTS_WITH", score, interaction types, methods, sources,
        publications)``, ``header`` is the ``header`` entry of the
        configuration and ``outputfileName`` is
        ``"intact_interacts_with.tsv"``.
    """
    config = builder_utils.get_config(config_name="intactConfig.yml", data_type='databases')
    header = config['header']
    outputfileName = "intact_interacts_with.tsv"
    url = config['intact_psimitab_url']
    directory = os.path.join(databases_directory, "Intact")
    builder_utils.checkDirectory(directory)
    fileName = os.path.join(directory, url.split('/')[-1])
    if download:
        builder_utils.downloadDB(url, directory)

    # Compiled once; both patterns are applied to every row of the file.
    label_regex = re.compile(r"\((.*)\)")
    taxid_regex = re.compile(r"\:(\d+)")
    # Highest column index read below is 14.
    min_columns = 15

    def label_of(field):
        """Return the text between the outermost parentheses, or "unknown"."""
        match = label_regex.search(field)
        return match.group(1) if match else "unknown"

    interactions = {}
    with open(fileName, 'r', encoding="utf-8") as idf:
        next(idf, None)  # skip the header line
        for line in idf:
            data = line.rstrip("\r\n").split("\t")
            if len(data) < min_columns:
                # Blank or truncated row: skip it instead of raising IndexError.
                continue

            # Both identifiers must look like "<db>:<id>"; anything else
            # (for instance a bare "-") cannot be used as an interactor.
            partsA = data[0].split(":")
            partsB = data[1].split(":")
            if len(partsA) < 2 or len(partsB) < 2:
                continue
            intA = partsA[1]
            intB = partsB[1]

            # Keep only pairs in which both taxonomy ids are 9606.
            tAmatch = taxid_regex.search(data[9])
            tBmatch = taxid_regex.search(data[10])
            if not (tAmatch and tBmatch):
                continue
            if tAmatch.group(1) != "9606" or tBmatch.group(1) != "9606":
                continue

            score_parts = data[14].split(":")
            if len(score_parts) < 2 or not builder_utils.is_number(score_parts[1]):
                continue
            score = float(score_parts[1])

            method = label_of(data[6])
            itype = label_of(data[11])
            source = label_of(data[12])
            # Normalise the separator for every record, including the first
            # one seen for a pair, so the joined output uses commas only.
            publications = data[8].replace('|', ',')

            entry = interactions.get((intA, intB))
            if entry is None:
                # The score of the first record seen for a pair is retained.
                interactions[(intA, intB)] = {'methods': {method},
                                              'sources': {source},
                                              'publications': {publications},
                                              'itype': {itype},
                                              'score': score}
            else:
                entry['methods'].add(method)
                entry['sources'].add(source)
                entry['publications'].add(publications)
                entry['itype'].add(itype)

    # Dictionary keys are already unique per pair, so no extra de-duplication
    # is needed. Set members are sorted to make the joined strings
    # reproducible between runs.
    relationships = set()
    for (intA, intB), entry in interactions.items():
        relationships.add((intA, intB, "CURATED_INTERACTS_WITH", entry['score'],
                           ",".join(sorted(entry['itype'])),
                           ",".join(sorted(entry['methods'])),
                           ",".join(sorted(entry['sources'])),
                           ",".join(sorted(entry['publications']))))

    builder_utils.remove_directory(directory)

    return (relationships, header, outputfileName)