diff --git a/edgehog/edgehog.py b/edgehog/edgehog.py index 72c04c8..b8cdc10 100644 --- a/edgehog/edgehog.py +++ b/edgehog/edgehog.py @@ -25,6 +25,7 @@ def main(): arg_parser.add_argument('--gff_directory', type=str, help='path to directory with the gffs of extant genomes ' '(each gff file must be named according to the name of an extant genome / leaf on the species tree)') arg_parser.add_argument('--hdf5', type=str, help='path to the hdf5 file (alternative to gff_directory to run edgeHOG on the entire OMA database)') + arg_parser.add_argument('--orient_edges', action='store_true', help='whether the transcriptional orientation of edges should be predicted') arg_parser.add_argument('--date_edges', action='store_true', help='whether the age of edges in extant species should be predicted') arg_parser.add_argument('--phylostratify', action='store_true', help='whether the number of edge retention, gain and loss should be analyzed for each node of the species tree') arg_parser.add_argument('--max_gaps', type=int, help='max_gaps can be seen as the theoritical maximal number of consecutive novel genes that can emerge between two older genes (default = 3), ' @@ -49,20 +50,20 @@ def main(): start_time = time.time() if args.gff_directory: - init_extant_graphs(1, ham, args.hogs, args.gff_directory, hogxml_entries, protein_id_to_hogxml_entry) + init_extant_graphs(1, ham, args.hogs, args.gff_directory, hogxml_entries, protein_id_to_hogxml_entry, args.orient_edges) elif args.hdf5: - init_extant_graphs_from_hdf5(ham, args.hdf5) + init_extant_graphs_from_hdf5(ham, args.hdf5, args.orient_edges) end_time = time.time() timer["preprocessing - loading extant synteny"] = end_time - start_time start_time = time.time() - leaves_to_root_synteny_propagation(ham, args.max_gaps) + leaves_to_root_synteny_propagation(ham, args.max_gaps, args.orient_edges) end_time = time.time() timer["bottom-up phase"] = end_time - start_time start_time = time.time() - root_to_leaves_edge_trimming(ham) + root_to_leaves_edge_trimming(ham, args.orient_edges) end_time = time.time() timer["top-down phase"] = end_time - start_time diff --git a/edgehog/infer_ancestral_synteny_graphs.py b/edgehog/infer_ancestral_synteny_graphs.py index 998d7d9..d734e2c 100644 --- a/edgehog/infer_ancestral_synteny_graphs.py +++ b/edgehog/infer_ancestral_synteny_graphs.py @@ -14,6 +14,55 @@ def print_graph_info(genome, graph): (genome.name, graph.number_of_nodes(), graph.number_of_edges(), nx.number_connected_components(graph), average_degree)) +#################################################################################### +# GOAL: +# Assign transcriptional orientations' weight in a reconnected edge based on +# weights of the 2 trimmed edges + +# Reconnection of edges (when we model gene emergence): +# u{A,B}: weight of unidirectional context -> -> in edges A and B +# d{A,B}: weight of divergent context <- -> in edges A and B +# c{A,B}: weight of convergent context -> <- in edges A and +# When middle gene is removed: +# -> (->) -> becomes -> -> Hence u = uA + uB or uB + uA +# -> (<-) -> becomes -> -> Hence u = cA + dB or dA + cB +# -> (<-) <- becomes -> <- Hence c = cA + uB or uA + cB +# <- (->) -> becomes <- -> Hence d = dA + uB or uA + dB +# But the weight of a reconnected edge is min(wA,wB) so u+c+d should be equal to min(wA,wB) +# This is done like this: +# f = min(wA, wB) +# tot = usum + csum + dsum +# u = f * usum / tot +# c = f * csum / tot +# d = f * dsum / tot +#################################################################################### +def orient_reconnected_edges(wA, uA, cA, dA, wB, uB, cB, dB): + usum = csum = dsum = 0 + if uA > 0: + if uB > 0: + usum += 2 * (uA + uB) + if cB > 0: + csum += uA + cB + if dB > 0: + dsum += uA + dB + if cA > 0: + if dB > 0: + usum += cA + dB + if uB > 0: + csum += cA + uB + if dA > 0: + if cB > 0: + usum += dA + cB + if uB > 0: + dsum += dA + uB + tot = usum + csum + dsum + if tot > 0: + f = min(wA, wB) + u = f * usum / tot + c = f * csum / tot + d = f * dsum / tot + return(u,c,d) + return(0,0,0) #################################################################################### # GOAL: @@ -63,7 +112,7 @@ def print_graph_info(genome, graph): # A-E # weight = 8 #################################################################################### -def build_transitive_graph(graph, max_gap_length = 2): +def build_transitive_graph(graph, orient_edges, max_gap_length = 2): real_edge_to_transitive_edge = dict() reconnected_edge_to_real_edges = dict() cnt_removed_nodes = 0 @@ -88,6 +137,17 @@ def build_transitive_graph(graph, max_gap_length = 2): weight = min(graph[left_gene][gene]["weight"], graph[gene][right_gene]["weight"]), children = list(set(graph[left_gene][gene]["children"] + graph[gene][right_gene]["children"])), extant_descendants = list(set(graph[left_gene][gene]["extant_descendants"] + graph[gene][right_gene]["extant_descendants"]))) + if orient_edges: + try: + print(left_gene.hog_id, right_gene.hog_id) + except: + print(left_gene.prot_id, right_gene.prot_id) + wA, uA, cA, dA = graph[left_gene][gene]["weight"], graph[left_gene][gene]["unidirectional"], graph[left_gene][gene]["convergent"], graph[left_gene][gene]["divergent"] + wB, uB, cB, dB = graph[gene][right_gene]["weight"], graph[gene][right_gene]["unidirectional"], graph[gene][right_gene]["convergent"], graph[gene][right_gene]["divergent"] + u, c, d = orient_reconnected_edges(wA, uA, cA, dA, wB, uB, cB, dB) + graph[left_gene][right_gene]['unidirectional'] = u + graph[left_gene][right_gene]['convergent'] = c + graph[left_gene][right_gene]['divergent'] = d # if gene that will be removed has already been reconnected: if (left_gene, gene) in reconnected_edge_to_real_edges: for e in reconnected_edge_to_real_edges[(left_gene, gene)]: @@ -127,7 +187,7 @@ def build_transitive_graph(graph, max_gap_length = 2): # the more likely ancestral genes belonging to HOGa and HOGb # were adjacent to each other in the ancestor of these genomes #################################################################################### -def leaves_to_root_synteny_propagation(ham, max_gaps): +def leaves_to_root_synteny_propagation(ham, max_gaps, orient_edges): print('###################################') print('Bottom-up_phase: propagate count of adjacencies observed between two genes from children nodes to their parent node on the species tree ' '(as long as the parent node has the two ancestral genes)') @@ -160,9 +220,10 @@ def leaves_to_root_synteny_propagation(ham, max_gaps): # to propagate the synteny graph to the higher level. # # The trimmed working copy is eventually stored as a transitive graph and will be used subsequently in the top-down phase - real_edge_to_transitive_edge = build_transitive_graph(child_working_graph, max_gaps) + real_edge_to_transitive_edge = build_transitive_graph(child_working_graph, orient_edges, max_gaps) child_transitive_graph = nx.Graph() - for gene, adjacent_gene, weight in child_working_graph.edges.data('weight', default=1): + for gene, adjacent_gene, edge_data in child_working_graph.edges.data(): + weight = edge_data['weight'] if(gene.parent is not None and adjacent_gene.parent is not None and gene.parent != adjacent_gene.parent): assert gene.parent in graph.nodes assert adjacent_gene.parent in graph.nodes @@ -178,9 +239,17 @@ def leaves_to_root_synteny_propagation(ham, max_gaps): graph[ancestral_edge[0]][ancestral_edge[1]]['weight'] += weight graph[ancestral_edge[0]][ancestral_edge[1]]['children'].append(child) graph[ancestral_edge[0]][ancestral_edge[1]]['extant_descendants'] += extant_descendants + if orient_edges: + graph[ancestral_edge[0]][ancestral_edge[1]]['unidirectional'] += edge_data['unidirectional'] + graph[ancestral_edge[0]][ancestral_edge[1]]['convergent'] += edge_data['convergent'] + graph[ancestral_edge[0]][ancestral_edge[1]]['divergent'] += edge_data['divergent'] else: # create edge otherwise, with the same weight as in current supporting child graph.add_edge(*ancestral_edge, weight = weight, children = [child], extant_descendants = extant_descendants) + if orient_edges: + graph[ancestral_edge[0]][ancestral_edge[1]]['unidirectional'] = edge_data['unidirectional'] + graph[ancestral_edge[0]][ancestral_edge[1]]['convergent'] = edge_data['convergent'] + graph[ancestral_edge[0]][ancestral_edge[1]]['divergent'] = edge_data['divergent'] if not gene.parent in child_transitive_graph.nodes: child_transitive_graph.add_node(gene.parent, real_gene = gene) @@ -188,8 +257,16 @@ def leaves_to_root_synteny_propagation(ham, max_gaps): child_transitive_graph.add_node(adjacent_gene.parent, real_gene = adjacent_gene) if child_transitive_graph.has_edge(*ancestral_edge): child_transitive_graph[gene.parent][adjacent_gene.parent]['weight'] += weight + if orient_edges: + child_transitive_graph[gene.parent][adjacent_gene.parent]['unidirectional'] += edge_data['unidirectional'] + child_transitive_graph[gene.parent][adjacent_gene.parent]['convergent'] += edge_data['convergent'] + child_transitive_graph[gene.parent][adjacent_gene.parent]['divergent'] += edge_data['divergent'] else: child_transitive_graph.add_edge(*ancestral_edge, weight = weight) + if orient_edges: + child_transitive_graph[gene.parent][adjacent_gene.parent]['unidirectional'] = edge_data['unidirectional'] + child_transitive_graph[gene.parent][adjacent_gene.parent]['convergent'] = edge_data['convergent'] + child_transitive_graph[gene.parent][adjacent_gene.parent]['divergent'] = edge_data['divergent'] child.add_feature('transitive_synteny', child_transitive_graph) child.add_feature('real_edge_to_transitive_edge', real_edge_to_transitive_edge) @@ -210,7 +287,7 @@ def leaves_to_root_synteny_propagation(ham, max_gaps): # if this adjacency is observed only in one child and not in the outgroup # (in this case, the LCA for this adjacency should be in the child lineage) #################################################################################### -def root_to_leaves_edge_trimming(ham): +def root_to_leaves_edge_trimming(ham, orient_edges): print('###################################') print('Top-down phase: prune any adjacency propagated before the last ancestor in which this adjacency is inferred to have emerged ') nb_ancestors = ham.taxonomy.tree.n_internal_nodes @@ -227,23 +304,42 @@ def root_to_leaves_edge_trimming(ham): edges_to_remove = list() graph = tree_node.bottom_up_synteny.copy() k = 0 - for gene, adjacent_gene, weight in graph.edges.data('weight', default=1): + for gene, adjacent_gene, edge_data in graph.edges.data(): + weight = edge_data['weight'] + if orient_edges: + initial_u = edge_data['unidirectional'] + initial_c = edge_data['convergent'] + initial_d = edge_data['divergent'] k += 1 remove_edge = True + if orient_edges: + u = c = d = 0 for child in tree_node.children: child_transitive_graph = child.transitive_synteny try: child_weight = child_transitive_graph[gene][adjacent_gene]['weight'] + if orient_edges: + child_u = child_transitive_graph[gene][adjacent_gene]['unidirectional'] + child_c = child_transitive_graph[gene][adjacent_gene]['convergent'] + child_d = child_transitive_graph[gene][adjacent_gene]['divergent'] except: - child_weight = 0 + child_weight = child_u = child_c = child_d = 0 # if at least two children contribute to the weight of the edge, # that means the edge was likely present in the current node if child_weight > 0 and child_weight < weight: remove_edge = False graph[gene][adjacent_gene]['nb_internal_nodes_from_ancestor_with_updated_weight'] = 0 - break + if orient_edges: + if child_u > 0 and child_u < initial_u: + u = initial_u + if child_c > 0 and child_c < initial_c: + c = initial_c + if child_d > 0 and child_d < initial_d: + d = initial_d + else: + break # if edge is only present in one children lineage, check if it is supported by outgroup (retain) or not (discard) - if remove_edge and tree_node.up: + if (remove_edge or orient_edges) and tree_node.up: try: # either there was some reconnection between genes because of insertion in descendant transitive_edge = tree_node.real_edge_to_transitive_edge[(gene, adjacent_gene)] @@ -267,8 +363,23 @@ def root_to_leaves_edge_trimming(ham): # if counter high: possible HGT or edge lost once and reemerged afterwards (convergent evolution) if 'nb_internal_nodes_from_ancestor_with_updated_weight' in ancestral_graph[transitive_edge[0]][transitive_edge[1]]: graph[gene][adjacent_gene]['nb_internal_nodes_from_ancestor_with_updated_weight'] += ancestral_graph[transitive_edge[0]][transitive_edge[1]]['nb_internal_nodes_from_ancestor_with_updated_weight'] + if orient_edges: + ancestral_u = ancestral_graph[transitive_edge[0]][transitive_edge[1]]['unidirectional'] + ancestral_c = ancestral_graph[transitive_edge[0]][transitive_edge[1]]['convergent'] + ancestral_d = ancestral_graph[transitive_edge[0]][transitive_edge[1]]['divergent'] + if ancestral_u > 0: + u = initial_u + if ancestral_c > 0: + c = initial_c + if ancestral_d > 0: + d = initial_d if remove_edge: edges_to_remove.append((gene, adjacent_gene)) + elif orient_edges: + if u + c + d == 0: + u = initial_u + c = initial_c + d = initial_d graph.remove_edges_from(edges_to_remove) tree_node.add_feature('top_down_synteny', graph) @@ -405,8 +516,4 @@ def linearize_graphs(ham): removed_edges.pop(idx) tree_node.add_feature('linear_synteny', graph) - - - - \ No newline at end of file diff --git a/edgehog/init_extant_synteny_graphs.py b/edgehog/init_extant_synteny_graphs.py index ef3dd0c..f5a56ea 100644 --- a/edgehog/init_extant_synteny_graphs.py +++ b/edgehog/init_extant_synteny_graphs.py @@ -79,10 +79,10 @@ def get_representative_hog_for_gene(ham, hogxml_entries): if best_hog: return best_hog return gene - + # Create a feature "synteny" that stores a synteny graph for the current extant genome -def assign_extant_synteny(genome, gene_dict, ham): +def assign_extant_synteny(genome, gene_dict, ham, orient_edges): graph = nx.Graph(genome = genome) contiguity_dict = dict() gene_keys = list(gene_dict.keys()) @@ -98,7 +98,19 @@ def assign_extant_synteny(genome, gene_dict, ham): graph.add_node(gene, **gene_dict[gene_keys[i]], contig = contig) if contig == old_contig: # connect genes only if they are on the same contig - graph.add_edge(old_gene, gene, weight = 1, children = [None], extant_descendants = [None]) + graph.add_edge(old_gene, gene, weight=1, unidirectional=0, convergent=0, divergent=0, children = [None], extant_descendants = [None]) + if orient_edges: + old_gene_strand = gene_dict[gene_keys[i-1]]['strand'] + gene_strand = gene_dict[gene_keys[i]]['strand'] + try: + if old_gene_strand == gene_strand: + graph[old_gene][gene]['unidirectional'] = 1 + elif old_gene_strand == "+": + graph[old_gene][gene]['convergent'] = 1 + elif old_gene_strand == "-": + graph[old_gene][gene]['divergent'] = 1 + except: + pass old_gene, old_contig = gene, contig # for cc in nx.connected_components(graph): # if len(cc > 1): @@ -109,18 +121,17 @@ def assign_extant_synteny(genome, gene_dict, ham): return graph, contiguity_dict -def gff_to_graph(ham, genome, gff, hogxml_entries, protein_id_to_hogxml_entry): +def gff_to_graph(ham, genome, gff, hogxml_entries, protein_id_to_hogxml_entry, orient_edges): gene_dict, protein_dict = gff_to_dict(gff) gene_dict = intersect_hogxml_and_gff(genome, hogxml_entries[genome], protein_id_to_hogxml_entry[genome], protein_dict, gene_dict) - graph, contiguity_dict = assign_extant_synteny(genome, gene_dict, ham) + graph, contiguity_dict = assign_extant_synteny(genome, gene_dict, ham, orient_edges) return graph, contiguity_dict -def init_extant_graphs(cpu, ham, hogs_file, gff_directory, hogxml_entries, protein_id_to_hogxml_entry): +def init_extant_graphs(cpu, ham, hogs_file, gff_directory, hogxml_entries, protein_id_to_hogxml_entry, orient_edges): print('###################################') print('Initializing synteny graphs of extant genomes ...') - ext_genomes = ham.get_list_extant_genomes() - + ext_genomes = ham.get_list_extant_genomes() genome_to_gff_dict = get_gffs_of_extant_genomes(gff_directory, ext_genomes) if cpu == 1: graphs = list() @@ -128,7 +139,7 @@ def init_extant_graphs(cpu, ham, hogs_file, gff_directory, hogxml_entries, prote i = 1 for genome in ext_genomes: print('processing extant genome %d/%d: %s' % (i, len(ext_genomes), genome.name)) - graph, contiguity_dict = gff_to_graph(ham, genome, genome_to_gff_dict[genome], hogxml_entries, protein_id_to_hogxml_entry) + graph, contiguity_dict = gff_to_graph(ham, genome, genome_to_gff_dict[genome], hogxml_entries, protein_id_to_hogxml_entry, orient_edges) graphs.append(graph) contiguity_dicts.append(contiguity_dict) i += 1 @@ -148,7 +159,7 @@ def init_extant_graphs(cpu, ham, hogs_file, gff_directory, hogxml_entries, prote return ham -def init_extant_graphs_from_hdf5(ham, hdf5_file): +def init_extant_graphs_from_hdf5(ham, hdf5_file, orient_edges): print('###################################') print('Initializing synteny graphs of extant genomes ...') from pyoma.browser import db @@ -161,6 +172,7 @@ def init_extant_graphs_from_hdf5(ham, hdf5_file): graph = nx.Graph() contiguity_dict = dict() old_gene = ham.get_gene_by_id(proteins[0]['EntryNr']) + old_gene_strand = proteins[0]['LocusStrand'] old_contig = proteins[0]['Chromosome'] graph.add_node(old_gene, contig = old_contig) for i in range(1, len(proteins)): @@ -168,7 +180,19 @@ def init_extant_graphs_from_hdf5(ham, hdf5_file): contig = proteins[i]['Chromosome'] graph.add_node(gene, contig = contig) if contig == old_contig: - graph.add_edge(gene, old_gene, weight=1, children = [None], extant_descendants = [None]) + graph.add_edge(gene, old_gene, weight=1, unidirectional=0, convergent=0, divergent=0, children = [None], extant_descendants = [None]) + if orient_edges: + old_gene_strand = proteins[i-1]['LocusStrand'] + gene_strand = proteins[i]['LocusStrand'] + try: + if old_gene_strand == gene_strand: + graph[old_gene][gene]['unidirectional'] = 1 + elif old_gene_strand == 1: + graph[old_gene][gene]['convergent'] = 1 + elif old_gene_strand == -1: + graph[old_gene][gene]['divergent'] = 1 + except: + pass old_gene, old_contig = gene, contig # for cc in nx.connected_components(graph): # if len(cc > 1): diff --git a/edgehog/write_output.py b/edgehog/write_output.py index 292e8e0..76b1db9 100644 --- a/edgehog/write_output.py +++ b/edgehog/write_output.py @@ -36,11 +36,13 @@ def label_nodes(graph): return label_dict, annotation_dict -def graph_to_df(graph, genome, edge_datation, label_dict=None, annotation_dict=None, include_extant_genes=False): +def graph_to_df(graph, genome, edge_datation, orient_edges, label_dict=None, annotation_dict=None, include_extant_genes=False): edge_dict = dict() singleton_dict = dict() column_names = ['gene1', 'gene2', 'weight', 'contiguous_region', 'nb_internal_nodes_from_ancestor_with_updated_weight', 'supporting_children'] + if orient_edges: + column_names += ['predicted_transcriptional_orientation', 'orientation_score'] if include_extant_genes: column_names += ['gene1_extant_annotations', 'gene2_extant_annotations'] if edge_datation: @@ -51,12 +53,20 @@ def graph_to_df(graph, genome, edge_datation, label_dict=None, annotation_dict=N car_counter = 0 for cc in nx.connected_components(graph): if len(cc) > 1: - for u, v, w in graph.subgraph(cc).edges.data("weight", default=1): + for u, v, edge_data in graph.subgraph(cc).edges.data(): + w = edge_data['weight'] edge_dict['contiguous_region'].append(car_counter) edge_dict['weight'].append(w) if isinstance(genome, pyham.AncestralGenome): edge_dict['gene1'].append(label_dict[u]) edge_dict['gene2'].append(label_dict[v]) + + if orient_edges: + # in case of equalities, here is the hierarchy: unidirectional > divergent > convergent + e_u, e_d, e_c = edge_data['unidirectional'], edge_data['divergent'], edge_data['convergent'] + edge_dict['predicted_transcriptional_orientation'].append(['unidirectional','divergent','convergent',][numpy.argmax([e_u, e_d, e_c])]) + edge_dict['orientation_score'].append(round(max(e_u,e_c,e_d))) + if include_extant_genes: edge_dict['gene1_extant_annotations'].append(annotation_dict[u]) edge_dict['gene2_extant_annotations'].append(annotation_dict[v]) @@ -71,6 +81,10 @@ def graph_to_df(graph, genome, edge_datation, label_dict=None, annotation_dict=N edge_dict['supporting_children'].append(None) edge_dict['gene1'].append(u.prot_id) edge_dict['gene2'].append(v.prot_id) + if orient_edges: + e_u, e_d, e_c = edge_data['unidirectional'], edge_data['divergent'], edge_data['convergent'] + edge_dict['predicted_transcriptional_orientation'].append(['unidirectional','divergent','convergent',][numpy.argmax([e_u, e_d, e_c])]) + edge_dict['orientation_score'].append(round(max(e_u,e_d,e_c))) if include_extant_genes: edge_dict['gene1_extant_annotations'].append(u.prot_id) edge_dict['gene2_extant_annotations'].append(v.prot_id) @@ -83,6 +97,9 @@ def graph_to_df(graph, genome, edge_datation, label_dict=None, annotation_dict=N u = list(cc)[0] for key in ['contiguous_region', 'weight', 'gene2', 'nb_internal_nodes_from_ancestor_with_updated_weight', 'supporting_children']: singleton_dict[key].append(None) + if orient_edges: + singleton_dict['predicted_transcriptional_orientation'].append(None) + singleton_dict['orientation_score'].append(None) if edge_datation: singleton_dict['predicted_edge_lca'].append(None) singleton_dict['predicted_edge_age_relative_to_root'].append(None) @@ -201,7 +218,7 @@ def write_output(args, ham, out_dir): if isinstance(genome, pyham.AncestralGenome): label_dict, annotation_dict = label_nodes(tree_node.bottom_up_synteny) - df = graph_to_df(tree_node.bottom_up_synteny, genome, False, label_dict, + df = graph_to_df(tree_node.bottom_up_synteny, genome, False, args.orient_edges, label_dict, annotation_dict, args.include_extant_genes) df.to_csv(os.path.join(out_dir, str(g) + '_bottom-up_synteny_graph_edges.tsv.gz'), compression='gzip', @@ -209,7 +226,7 @@ def write_output(args, ham, out_dir): header=True, sep='\t') - df = graph_to_df(tree_node.top_down_synteny, genome, args.date_edges, label_dict, + df = graph_to_df(tree_node.top_down_synteny, genome, args.date_edges, args.orient_edges, label_dict, annotation_dict, args.include_extant_genes) df.to_csv(os.path.join(out_dir, str(g) + '_top-down_synteny_graph_edges.tsv.gz'), compression='gzip', @@ -217,7 +234,7 @@ def write_output(args, ham, out_dir): header=True, sep='\t') - df = graph_to_df(tree_node.linear_synteny, genome, args.date_edges, label_dict, + df = graph_to_df(tree_node.linear_synteny, genome, args.date_edges, args.orient_edges, label_dict, annotation_dict, args.include_extant_genes) df.to_csv(os.path.join(out_dir, str(g) + '_linearized_synteny_graph_edges.tsv.gz'), compression='gzip', @@ -226,7 +243,7 @@ def write_output(args, ham, out_dir): sep='\t') else: # extant genome - df = graph_to_df(tree_node.bottom_up_synteny, genome, edge_datation=args.date_edges) + df = graph_to_df(tree_node.bottom_up_synteny, genome, args.date_edges, args.orient_edges) df.to_csv(os.path.join(out_dir, str(g) + '_extant_synteny_graph_edges.tsv.gz'), compression='gzip', index=False,