Source code for coralme.builder.homology

import re
import pandas
import logging


[docs]
class Homology(object):
	"""
	Homology class for storing information about homology of the
	main and reference organisms.

	This class contains methods to predict and process homology
	of the main and reference organisms. Homology is inferred from
	the reciprocal best hits of a BLAST. The results are used to
	update and complement the attributes of the class Organism.

	Parameters
	----------
	org : str
		Identifier of the main organism. Has to be the same as its
		containing folder name.

	ref : str
		Identifier of the reference organism. Has to be the same as
		its containing folder name.

	evalue : float
		E-value cutoff to call enzyme homologs from the BLAST. Two
		reciprocal best hits are considered homologs if their
		E-value is less than this parameter.
	"""

	def __init__(self, org, ref, evalue = False, verbose = False):
		self.org = org
		self.ref = ref

		column_names = 'qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore'.split('\t')
		self.org_df = pandas.read_csv(self.org.blast_directory + '/org_as_db.txt', sep = '\t', index_col = 0, names = column_names)
		self.ref_df = pandas.read_csv(self.org.blast_directory + '/ref_as_db.txt', sep = '\t', index_col = 0, names = column_names)

		self.get_mutual_hits(evalue = evalue)

	def get_mutual_hits(self, evalue = False, verbose = False):
		# to not import from coralme.builder.homology
		def get_top_hits(df, evalue = False):
			top_hits = {}
			for g in df.index.unique():
				hits = df.loc[g]
				if isinstance(hits, pandas.DataFrame):
					top_hit = hits.sort_values(by = 'evalue').iloc[0]
				else:
					top_hit = hits
				if evalue and top_hit['evalue'] > evalue:
					continue
				top_hits[g] = top_hit
			return top_hits

		org_df = self.org_df
		ref_df = self.ref_df

		if verbose:
			logging.warning('Getting top hits...')

		org_top_hits = get_top_hits(org_df, evalue = evalue)
		ref_top_hits = get_top_hits(ref_df, evalue = evalue)

		if verbose:
			logging.warning('Getting reciprocal top hits...')

		mutual_hits = {}
		for g, hit in org_top_hits.items():
			hit_g = hit['sseqid']

			if hit_g not in ref_top_hits:
				continue

			if g == ref_top_hits[hit_g]['sseqid']:
				mutual_hits[g] = {}
				mutual_hits[hit_g] = {}

				mutual_hits[g]['query'] = hit_g
				mutual_hits[g]['evalue'] = hit['evalue']

				mutual_hits[hit_g]['query'] = g
				mutual_hits[hit_g]['evalue'] = ref_top_hits[hit_g]['evalue']

		if verbose:
			logging.warning('Done')

		mutual_hits_df = pandas.DataFrame.from_dict(mutual_hits).T.sort_index()
		self.mutual_hits_df = mutual_hits_df
		self.mutual_hits = mutual_hits_df['query'].to_dict()

	def get_complex_homology(self):
		org_complexes_df = self.org.complexes_df
		ref_complexes_df = self.ref.complexes_df
		mutual_hits = self.mutual_hits

		# Get ref homology for complexes already annotated in BioCyc
		org_cplx_homolog = {}
		ref_cplx_homolog = {}
		not_annotated_candidates = set()
		warn_candidates = []

		for c, row in org_complexes_df.iterrows():
			if isinstance(row['genes'], float) or not row['genes']:
				continue

			genes = [re.findall(r'.*(?=\(\d*\))', g)[0] for g in row['genes'].split(' AND ')]

			if not genes or not set(genes).issubset(set(mutual_hits.keys())):
				continue  # All org genes must have a hit

			for g in genes:
				if c in org_cplx_homolog:
					break

				ref_gene = mutual_hits[g]
				ref_complexes = ref_complexes_df[ref_complexes_df['genes'].str.contains(ref_gene)]
				for rc, rrow in ref_complexes.iterrows():
					rgenes = [re.findall(r'.*(?=\(\d*\))', g)[0] for g in rrow['genes'].split(' AND ')]
					if not set(rgenes).issubset(set(mutual_hits.keys())):
						continue  # All ref genes must have a hit
					ogenes = [mutual_hits[og] for og in genes]
					if set(rgenes) == set(ogenes):  # Complex identified
						org_cplx_homolog[c] = rc
						ref_cplx_homolog[rc] = c
					else:
						if rc not in not_annotated_candidates:
							warn_candidates.append({
								'complex': c,
								'reference_complex' : rc
								})
						not_annotated_candidates.add(rc)

		not_annotated_candidates = not_annotated_candidates.difference(set(ref_cplx_homolog.keys()))
		logging.warning('{} complexes were mapped successfully'.format(len(org_cplx_homolog)))

		if warn_candidates:
			self.org.curation_notes['org.get_complex_homology'].append({
				'msg':'Some complexes were partial hits in the BLAST',
				'triggered_by':warn_candidates,
				'importance':'medium',
				'to_do':'Curate these manually in protein_corrections.txt'})
		for i in self.org.generic_dict.keys():
			org_cplx_homolog[i] = i
			ref_cplx_homolog[i] = i
		self.org_cplx_homolog = org_cplx_homolog
		self.ref_cplx_homolog = ref_cplx_homolog
		self.not_annotated_candidates = not_annotated_candidates