# Source code for ckg.graphdb_builder.databases.parsers.refseqParser

import os.path
from collections import defaultdict
from ckg.graphdb_builder import builder_utils

#########################
#          RefSeq       #
#########################
def parser(databases_directory, download=True):
    """
    Parse a RefSeq feature table into Transcript/Chromosome entities and their relationships.

    Configuration is read from "refseqConfig.yml". When ``download`` is True, the first
    entry returned by ``builder_utils.list_ftp_directory`` for the configured FTP directory
    is used to build the URL of a "<name>_feature_table.txt.gz" file, which is downloaded
    into the "RefSeq" sub-directory; otherwise the file named after the last path component
    of ``refseq_url`` is expected to be there already. If the file does not exist, empty
    collections are returned.

    NOTE: the "RefSeq" directory is handed to ``builder_utils.remove_directory`` after
    parsing, regardless of the value of ``download``.

    :param str databases_directory: directory under which the "RefSeq" sub-directory is used.
    :param bool download: whether the feature table is downloaded before parsing.
    :return: tuple ``(entities, relationships, headers)`` where

        - *entities* maps "Transcript" to tuples
          ``(accession, "Transcript", name, class, assembly, taxid)`` and "Chromosome" to
          tuples ``(chromosome, "Chromosome", chromosome, taxid)``;
        - *relationships* maps "LOCATED_IN" to tuples
          ``(accession, chromosome, "LOCATED_IN", start, end, strand, "RefSeq")`` and
          "TRANSCRIBED_INTO" to tuples ``(symbol, accession, "TRANSCRIBED_INTO", "RefSeq")``;
        - *headers* is the ``headerEntities`` value from the configuration.
    :raises IndexError: if a data row has fewer than 15 tab-separated columns.
    """
    config = builder_utils.get_config(config_name="refseqConfig.yml", data_type='databases')
    url = config['refseq_url']
    ftp_dir = config['refseq_ftp_dir']
    entities = defaultdict(set)
    relationships = defaultdict(set)
    directory = os.path.join(databases_directory, "RefSeq")
    builder_utils.checkDirectory(directory)
    file_name = os.path.join(directory, url.split('/')[-1])
    headers = config['headerEntities']
    taxid = 9606  # hard-coded taxonomy id attached to every entity (presumably human — confirm)

    if download:
        # The last path component of the first FTP listing entry names both the remote
        # sub-directory and the prefix of the feature-table file.
        latest = builder_utils.list_ftp_directory(ftp_dir)[0].split('/')[-1]
        new_file = latest + "_feature_table.txt.gz"
        url = ftp_dir + latest + "/" + new_file
        builder_utils.downloadDB(url, directory)
        file_name = os.path.join(directory, new_file)

    if os.path.isfile(file_name):
        handle = builder_utils.read_gzipped_file(file_name)
        try:
            for index, line in enumerate(handle):
                if index == 0:
                    # The first line is the column header.
                    continue
                data = line.rstrip("\r\n").split("\t")
                # Column positions are taken as-is from the original parser; presumably they
                # follow the NCBI *_feature_table.txt layout — TODO confirm against the README.
                tclass = data[1]
                assembly = data[2]
                chrom = data[5]
                gene_acc = data[6]
                start = data[7]
                end = data[8]
                strand = data[9]
                prot_acc = data[10]
                name = data[13]
                symbol = data[14]

                # Prefer data[10] as the transcript identifier and fall back to data[6].
                accession = prot_acc if prot_acc != "" else gene_acc
                if accession == "":
                    continue

                entities["Transcript"].add((accession, "Transcript", name, tclass, assembly, taxid))
                if chrom != "":
                    entities["Chromosome"].add((chrom, "Chromosome", chrom, taxid))
                    # Key the relationship on the same identifier used for the Transcript
                    # entity. Previously the fallback branch used the (empty) data[10] value,
                    # producing LOCATED_IN rows with an empty source id.
                    relationships["LOCATED_IN"].add((accession, chrom, "LOCATED_IN", start, end, strand, "RefSeq"))
                # TRANSCRIBED_INTO is only emitted for rows that carry a data[10] accession.
                if prot_acc != "" and symbol != "":
                    relationships["TRANSCRIBED_INTO"].add((symbol, prot_acc, "TRANSCRIBED_INTO", "RefSeq"))
        finally:
            # Release the handle even when a malformed row raises.
            handle.close()

    builder_utils.remove_directory(directory)

    return (entities, relationships, headers)
if __name__ == "__main__":
    # Script entry point: run the parser (with its default arguments) against the
    # "data/databases" directory addressed relative to this module's own location.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    parser(os.path.join(module_dir, "../../../../data/databases"))