Source code for coralme.builder.homology

import re
import pandas
import logging

[docs] class Homology(object): """ Homology class for storing information about homology of the main and reference organisms. This class contains methods to predict and process homology of the main and reference organisms. Homology is inferred from the reciprocal best hits of a BLAST. The results are used to update and complement the attributes of the class Organism. Parameters ---------- org : str Identifier of the main organism. Has to be the same as its containing folder name. ref : str Identifier of the reference organism. Has to be the same as its containing folder name. evalue : float E-value cutoff to call enzyme homologs from the BLAST. Two reciprocal best hits are considered homologs if their E-value is less than this parameter. """ def __init__(self, org, ref, evalue = False, verbose = False): self.org = org self.ref = ref column_names = 'qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore'.split('\t') self.org_df = pandas.read_csv(self.org.blast_directory + '/org_as_db.txt', sep = '\t', index_col = 0, names = column_names) self.ref_df = pandas.read_csv(self.org.blast_directory + '/ref_as_db.txt', sep = '\t', index_col = 0, names = column_names) self.get_mutual_hits(evalue = evalue) def get_mutual_hits(self, evalue = False, verbose = False): # to not import from coralme.builder.homology def get_top_hits(df, evalue = False): top_hits = {} for g in df.index.unique(): hits = df.loc[g] if isinstance(hits, pandas.DataFrame): top_hit = hits.sort_values(by = 'evalue').iloc[0] else: top_hit = hits if evalue and top_hit['evalue'] > evalue: continue top_hits[g] = top_hit return top_hits org_df = self.org_df ref_df = self.ref_df if verbose: logging.warning('Getting top hits...') org_top_hits = get_top_hits(org_df, evalue = evalue) ref_top_hits = get_top_hits(ref_df, evalue = evalue) if verbose: logging.warning('Getting reciprocal top hits...') mutual_hits = {} for g, hit in org_top_hits.items(): hit_g = hit['sseqid'] if hit_g not in ref_top_hits: continue if g == ref_top_hits[hit_g]['sseqid']: mutual_hits[g] = {} mutual_hits[hit_g] = {} mutual_hits[g]['query'] = hit_g mutual_hits[g]['evalue'] = hit['evalue'] mutual_hits[hit_g]['query'] = g mutual_hits[hit_g]['evalue'] = ref_top_hits[hit_g]['evalue'] if verbose: logging.warning('Done') mutual_hits_df = pandas.DataFrame.from_dict(mutual_hits).T.sort_index() self.mutual_hits_df = mutual_hits_df self.mutual_hits = mutual_hits_df['query'].to_dict() def get_complex_homology(self): org_complexes_df = self.org.complexes_df ref_complexes_df = self.ref.complexes_df mutual_hits = self.mutual_hits # Get ref homology for complexes already annotated in BioCyc org_cplx_homolog = {} ref_cplx_homolog = {} not_annotated_candidates = set() warn_candidates = [] for c, row in org_complexes_df.iterrows(): if isinstance(row['genes'], float) or not row['genes']: continue genes = [re.findall(r'.*(?=\(\d*\))', g)[0] for g in row['genes'].split(' AND ')] if not genes or not set(genes).issubset(set(mutual_hits.keys())): continue # All org genes must have a hit for g in genes: if c in org_cplx_homolog: break ref_gene = mutual_hits[g] ref_complexes = ref_complexes_df[ref_complexes_df['genes'].str.contains(ref_gene)] for rc, rrow in ref_complexes.iterrows(): rgenes = [re.findall(r'.*(?=\(\d*\))', g)[0] for g in rrow['genes'].split(' AND ')] if not set(rgenes).issubset(set(mutual_hits.keys())): continue # All ref genes must have a hit ogenes = [mutual_hits[og] for og in genes] if set(rgenes) == set(ogenes): # Complex identified org_cplx_homolog[c] = rc ref_cplx_homolog[rc] = c else: if rc not in not_annotated_candidates: warn_candidates.append({ 'complex': c, 'reference_complex' : rc }) not_annotated_candidates.add(rc) not_annotated_candidates = not_annotated_candidates.difference(set(ref_cplx_homolog.keys())) logging.warning('{} complexes were mapped successfully'.format(len(org_cplx_homolog))) if warn_candidates: self.org.curation_notes['org.get_complex_homology'].append({ 'msg':'Some complexes were partial hits in the BLAST', 'triggered_by':warn_candidates, 'importance':'medium', 'to_do':'Curate these manually in protein_corrections.txt'}) for i in self.org.generic_dict.keys(): org_cplx_homolog[i] = i ref_cplx_homolog[i] = i self.org_cplx_homolog = org_cplx_homolog self.ref_cplx_homolog = ref_cplx_homolog self.not_annotated_candidates = not_annotated_candidates