-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
363 changed files
with
10,271,280 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
__pycache__/ | ||
.python-version |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
function S = FINAL(A1, A2, N1, N2, E1, E2, H, alpha, maxiter, tol)
% FINAL  Generalized attributed network alignment (fixed-point iteration).
%
% Description:
%   The algorithm is the generalized attributed network alignment algorithm.
%   It handles the cases where node attributes and/or edge attributes are
%   given or missing. If no node attributes or edge attributes are given,
%   the corresponding input variable is empty, e.g., N1 = [], E1 = {}.
%   Both numerical and categorical attributes (feature vectors) are
%   supported for edges and nodes.
%
%   Cosine similarity is used for node and edge feature vectors, e.g.,
%   sim(v1, v2) = <v1, v2>/(||v1||_2*||v2||_2). For categorical attributes
%   this is still equivalent to the indicator function in the original
%   published paper.
%
% Input:
%   A1, A2: Input adjacency matrices with n1, n2 nodes.
%   N1, N2: Node attribute matrices. N1 is an n1*K matrix, N2 is an n2*K
%           matrix; each row is a node and each column an attribute.
%           Categorical attributes can be one-hot encoded, e.g., for
%           countries (USA, China, Canada) a user from China has the node
%           feature (0, 1, 0). If N1 = [] and N2 = [], no node attributes
%           are used.
%   E1, E2: Cell arrays with L elements (either L*1 or 1*L), where E1{i}
%           is an n1*n1 matrix whose nonzero entries are the i-th
%           attribute of the edges; E2{i} likewise (n2*n2). Categorical
%           edge attributes can be one-hot encoded, i.e., E1{i}(a,b) = 1
%           if edge (a,b) has categorical attribute i. If E1 and E2 are
%           both empty ({} or []), no edge attributes are used.
%   H: an n2*n1 prior node similarity matrix, e.g., degree similarity.
%      H should be normalized, e.g., sum(sum(H)) = 1.
%   alpha: decay factor.
%   maxiter, tol: maximum number of iterations and difference tolerance.
%
% Output:
%   S: an n2*n1 alignment matrix; entry (x,y) represents to what extent
%      node x in A2 is aligned to node y in A1.
%
% Reference:
%   Zhang, Si, and Hanghang Tong. "FINAL: Fast Attributed Network Alignment."
%   Proceedings of the 22nd ACM SIGKDD International Conference on
%   Knowledge Discovery and Data Mining. ACM, 2016.

n1 = size(A1, 1); n2 = size(A2, 1);

% If no node attributes are input, initialize them as a vector of ones so
% that all nodes are treated as having the same attribute, which is
% equivalent to no given node attribute.
if isempty(N1) && isempty(N2)
    N1 = ones(n1, 1);
    N2 = ones(n2, 1);
end

% If no edge attributes are input, i.e., E1 and E2 are empty, initialize
% them as a cell with one element equal to the binarized adjacency matrix,
% so that all edges are treated as having the same attribute. This is
% equivalent to no given edge attributes.
if isempty(E1) && isempty(E2)
    E1 = cell(1,1); E2 = cell(1,1);
    E1{1} = A1; E2{1} = A2;
    E1{1}(A1 > 0) = 1; E2{1}(A2 > 0) = 1;
end

K = size(N1, 2);
% numel (not size(E1, 2)) so that the documented L*1 cell layout is counted
% correctly; size(E1, 2) would be 1 for a column cell and silently drop
% every edge attribute after the first.
L = numel(E1);
T1 = spconvert([n1, n1, 0]);   % n1*n1 all-zero sparse accumulator
T2 = spconvert([n2, n2, 0]);   % n2*n2 all-zero sparse accumulator

% Normalize edge feature vectors
for l = 1: L
    T1 = T1 + E1{l}.^2; % calculate ||v1||_2^2
    T2 = T2 + E2{l}.^2; % calculate ||v2||_2^2
end
% spfun only touches the nonzero entries, so zero norms stay zero.
T1 = spfun(@(x) 1./sqrt(x), T1); T2 = spfun(@(x) 1./sqrt(x), T2);
for l = 1: L
    E1{l} = E1{l} .* T1; % normalize each entry by vector norm T1
    E2{l} = E2{l} .* T2; % normalize each entry by vector norm T2
end

% Normalize node feature vectors (all-zero rows get a scale of 0)
K1 = sum(N1.^2, 2).^(-0.5); K1(K1 == Inf) = 0;
K2 = sum(N2.^2, 2).^(-0.5); K2(K2 == Inf) = 0;
N1 = bsxfun(@times, K1, N1); % normalize the node attribute for A1
N2 = bsxfun(@times, K2, N2); % normalize the node attribute for A2

% Compute node feature cosine cross-similarity
N = spconvert([n1*n2, 1, 0]);
for k = 1: K
    N = N + kron(N1(:, k), N2(:, k)); % compute N as a kronecker similarity
end

% Compute the Kronecker degree vector
d = spconvert([n1*n2, 1, 0]);
tic;
for l = 1: L
    for k = 1: K
        d = d + kron((E1{l} .* A1) * N1(:, k), (E2{l} .* A2) * N2(:,k));
    end
end
fprintf('Time for degree: %.2f sec\n', toc);
D = N .* d; DD = D.^(-0.5);
DD(D == 0) = 0; % define inf to 0

% fixed-point solution
q = DD .* N;
h = H(:); s = h;

for i = 1: maxiter
    fprintf('iteration %d\n', i);
    tic;
    prev = s;
    M = reshape(q.*s, n2, n1);
    S = spconvert([n2, n1, 0]);
    for l = 1: L
        S = S + (E2{l} .* A2) * M * (E1{l} .* A1); % calculate the consistency part
    end
    s = (1 - alpha) * h + alpha * q .* S(:); % add the prior part
    delta = norm(s - prev); % named delta to avoid shadowing the builtin diff

    fprintf('Time for iteration %d: %.2f sec, diff = %.5f\n', i, toc, 100*delta);
    if delta < tol % if converge
        break;
    end
end

S = reshape(s, n2, n1); % reshape the similarity vector to a matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
function [M, sim_matrix] = greedy_match(X)
% greedy_match Computes a greedy bipartite matching based on the scores in
% the input matrix.
%
% Entries of X are visited from the highest score to the lowest; an entry
% (i,j) is accepted as a match when neither row i nor column j has been
% matched yet. The loop stops once min(m, n) matches have been made.
%
% Input arguments:
%   - X: the matrix with the similarity scores (similarity matrix).
%        Element X(i,j) is the similarity score of node i in B and node j
%        in A; if B has m nodes and A has n nodes then X is an m x n
%        matrix.
% Output arguments:
%   - M: the sparse matrix with the matches: M(i,j) = 1.0 iff node i in B
%        matches with node j in A. m x n matrix (same dimensions as X),
%        also called the "matching matrix".
%   - sim_matrix: the transpose of the input X (n x m).

[m, n] = size(X);
minSize = min(m, n);

usedRows = false(m, 1);
usedCols = false(n, 1);
row = zeros(minSize, 1);
col = zeros(minSize, 1);

% Linear indices of X ordered from the highest score to the lowest.
[~, ix] = sort(X(:), 'descend');

matched = 1;
index = 1;
while (matched <= minSize)
    ipos = ix(index);            % position in the original vectorized matrix
    jc = ceil(ipos / m);         % column of that linear index
    ic = ipos - (jc - 1) * m;    % row of that linear index (always in 1..m)
    if ~usedRows(ic) && ~usedCols(jc)
        row(matched) = ic;
        col(matched) = jc;
        usedRows(ic) = true;
        usedCols(jc) = true;
        matched = matched + 1;
    end
    index = index + 1;
end

M = sparse(row, col, ones(minSize, 1), m, n);

sim_matrix = transpose(X);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import numpy as np | ||
|
||
class Graph(object):
    '''Graph kept as flat neighbor/weight arrays plus per-node offsets.

    Note: adj_list shows each edge twice, so edge_num is really two times
    the number of edges for an undirected graph.
    '''

    def __init__(self, node_num, edge_num):
        '''Allocate empty storage for `node_num` nodes and `edge_num` adjacency entries.'''
        self.node_num = node_num  # n
        self.edge_num = edge_num  # m

        # One big array holding the neighbors of every node; -1 marks an unfilled slot.
        self.adj_list = np.full(edge_num, -1, dtype=np.int32)
        # Index into adj_list where each node's neighbors begin. It has one extra
        # trailing element; per the original note that element is meant to equal
        # edge_num, but it is only zero-initialized here (the filler must set it).
        self.adj_idx = np.zeros(node_num + 1, dtype=np.int32)
        # Same dimension as adj_list: weight on each edge. CAN be float numbers.
        self.adj_wgt = np.zeros(edge_num, dtype=np.float32)
        self.node_wgt = np.zeros(node_num, dtype=np.float32)
        # Mapping of each node to the coarser graph; -1 until assigned.
        self.cmap = np.full(node_num, -1, dtype=np.int32)

        # Weighted degree: the sum of the adjacency weights of each vertex,
        # including self-loop.
        self.degree = np.zeros(node_num, dtype=np.float32)
        self.A = None
        self.C = None  # Matching Matrix

        self.coarser = None
        self.finer = None

    def resize_adj(self, edge_num):
        '''Resize the adjacency list and weight arrays to `edge_num` entries.'''
        self.adj_list, self.adj_wgt = (
            np.resize(self.adj_list, edge_num),
            np.resize(self.adj_wgt, edge_num),
        )

    def get_neighs(self, idx):
        '''Return the slice of adj_list holding the neighbors of node `idx`.'''
        return self.adj_list[self.adj_idx[idx]:self.adj_idx[idx + 1]]

    def get_neigh_edge_wgts(self, idx):
        '''Return the slice of adj_wgt holding the edge weights of node `idx`.'''
        return self.adj_wgt[self.adj_idx[idx]:self.adj_idx[idx + 1]]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
from collections import defaultdict | ||
from graph import Graph | ||
import numpy as np | ||
from utils import cmap2C | ||
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter | ||
from utils import from_alignment_edgelist_to_graph, create_coarse_graph | ||
import importlib | ||
import logging | ||
import numpy as np | ||
import pdb | ||
import os | ||
import time | ||
import pickle | ||
|
||
#### Credit: codes partially borrowed from MILE: https://github.com/jiongqian/MILE | ||
#### Also cited in paper | ||
|
||
def parse_args():
    '''Parse the command-line options of the coarsening script.

    Returns the argparse namespace with `data`, `coarsen_level`, `workers`
    and `output_path`; `--data` and `--output-path` are mandatory.
    '''
    cli = ArgumentParser(
        conflict_handler='resolve',
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument('--data', help='Input graph file', required=True)
    cli.add_argument('--coarsen-level', type=int, default=2,
                     help='MAX number of levels of coarsening.')
    cli.add_argument('--workers', type=int, default=4,
                     help='Number of workers.')
    cli.add_argument('--output-path', help='Path to save the output pickle file',
                     required=True)
    return cli.parse_args()
|
||
def normalized_adj_wgt(graph):
    '''Return the degree-normalized weight of every adjacency entry.

    For the adjacency entry at position j, owned by node i and pointing to
    neighbor k = graph.adj_list[j], the result holds
    graph.adj_wgt[j] / sqrt(graph.degree[i] * graph.degree[k]).

    Reads node_num, adj_idx, adj_list, adj_wgt and degree from `graph`.
    Returns a float32 array with the same shape as graph.adj_wgt; positions
    not covered by adj_idx stay 0.
    '''
    adj_list = graph.adj_list
    adj_wgt = graph.adj_wgt
    adj_idx = graph.adj_idx
    degree = graph.degree
    norm_wgt = np.zeros(adj_wgt.shape, dtype=np.float32)
    for i in range(graph.node_num):
        for j in range(adj_idx[i], adj_idx[i + 1]):
            neigh = adj_list[j]
            # adj_wgt is parallel to adj_list, so the weight of this edge lives at
            # position j; indexing it with the neighbor's node id picks the weight
            # of an unrelated edge.
            norm_wgt[j] = adj_wgt[j] / np.sqrt(degree[i] * degree[neigh])
    return norm_wgt
|
||
def normalized_heavy_edge_matching(args, graph):
    '''Greedily pair each node with its heaviest (normalized) unmatched neighbor.

    Nodes are visited in increasing order of neighbor count. A node is paired
    with the unmatched neighbor that has the largest normalized edge weight,
    provided their combined node weight does not exceed a cap derived from
    args.coarsen_level; otherwise it forms a group on its own.

    Side effect: graph.cmap is overwritten in place with the group index of
    every node. Returns (groups, coarse_graph_size), where groups is a list of
    one- or two-element lists and coarse_graph_size is the number of groups.
    '''
    n_nodes = graph.node_num
    neighbors = graph.adj_list    # big array for neighbors.
    offsets = graph.adj_idx       # beginning idx of neighbors.
    vertex_wgt = graph.node_wgt   # weight on node
    coarse_map = graph.cmap
    edge_score = normalized_adj_wgt(graph)

    target_size = max(1, graph.node_num // (2 ** args.coarsen_level))  # rough estimation.
    wgt_cap = int((5.0 * graph.node_num) / target_size)

    groups = []  # a list of groups, each group corresponding to one coarse node.
    is_matched = [False] * n_nodes

    neighbor_counts = [offsets[v + 1] - offsets[v] for v in range(0, n_nodes)]
    for node in np.argsort(neighbor_counts):
        if is_matched[node]:
            continue
        partner, best_score = node, -1
        for pos in range(offsets[node], offsets[node + 1]):
            cand = neighbors[pos]
            # KEY: exclude self-loop. Otherwise, mostly matching with itself.
            if cand == node:
                continue
            score = edge_score[pos]
            if is_matched[cand] or not (best_score < score):
                continue
            if not (vertex_wgt[node] + vertex_wgt[cand] <= wgt_cap):
                continue
            partner, best_score = cand, score
        # partner may still be node itself, meaning no match could be found.
        is_matched[node] = is_matched[partner] = True
        groups.append([node] if node == partner else [node, partner])

    for coarse_id, members in enumerate(groups):
        for member in members:
            coarse_map[member] = coarse_id
    return (groups, len(groups))
|
||
def multilevel_embed(args, graph):
    '''Build the coarsening hierarchy of `graph`.

    Runs args.coarsen_level rounds; each round computes a matching on the
    current graph and passes it to create_coarse_graph to obtain the next
    one. Returns the list of graphs, starting with the input graph followed
    by one entry per round.
    '''
    # Step-1: Graph Coarsening.
    hierarchy = [graph]
    for _ in range(args.coarsen_level):
        current = hierarchy[-1]
        groups, group_count = normalized_heavy_edge_matching(args, current)
        hierarchy.append(create_coarse_graph(current, groups, group_count))
    return hierarchy
|
||
if __name__ == "__main__":
    # Script entry point: load the graphs named by --data, coarsen each one,
    # and pickle the per-level A and C attributes of the resulting graphs.
    before_emb = time.time()
    seed = 123
    np.random.seed(seed)
    args = parse_args()

    # The loader returns a pair; only the graphs are used here.
    graphs, _ = from_alignment_edgelist_to_graph(args.data)
    labels = None
    embed_As = []
    embed_Cs = []
    for graph in graphs:
        # Generate multilevel projects
        graph_embeds = multilevel_embed(args, graph)
        # Only the first coarsen_level graphs are recorded (the last, coarsest
        # entry of graph_embeds is skipped, as in the original loop).
        level_range = range(args.coarsen_level)
        embed_As.append([graph_embeds[i].A for i in level_range])
        embed_Cs.append([graph_embeds[i].C for i in level_range])

    to_save = {'A_list': embed_As, 'matches': embed_Cs, 'labels': labels}
    # Context manager so the file is closed even if pickling fails.
    with open(args.output_path, "wb") as pickle_out:
        pickle.dump(to_save, pickle_out)

    after_emb = time.time()
    total_time = after_emb - before_emb
    print("time for embed (in seconds): %f" % total_time)
    print("MILE completed")
Binary file not shown.
Oops, something went wrong.