Skip to content

Commit

Permalink
code update
Browse files Browse the repository at this point in the history
  • Loading branch information
jwzhi committed Aug 13, 2022
1 parent f6cde07 commit 857d57b
Show file tree
Hide file tree
Showing 363 changed files with 10,271,280 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/
.python-version
138 changes: 138 additions & 0 deletions FINAL/matlab/FINAL.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
function S = FINAL(A1, A2, N1, N2, E1, E2, H, alpha, maxiter, tol)
% Description:
%   The algorithm is the generalized attributed network alignment algorithm.
%   The algorithm can handle the cases no matter node attributes and/or edge
%   attributes are given. If no node attributes or edge attributes are given,
%   then the corresponding input variable of the function is empty, e.g.,
%   N1 = [], E1 = {}.
%   The algorithm can handle either numerical or categorical attributes
%   (feature vectors) for both edges and nodes.
%
%   The algorithm uses cosine similarity to calculate node and edge feature
%   vector similarities. E.g., sim(v1, v2) = <v1, v2>/(||v1||_2*||v2||_2).
%   For categorical attributes, this is still equivalent to the indicator
%   function in the original published paper.
%
% Input:
%   A1, A2: Input adjacency matrices with n1, n2 nodes
%   N1, N2: Node attributes matrices, N1 is an n1*K matrix, N2 is an n2*K
%         matrix, each row is a node, and each column represents an
%         attribute. If the input node attributes are categorical, we can
%         use one hot encoding to represent each node feature as a vector.
%         And the input N1 and N2 are still n1*K and n2*K matrices.
%         E.g., for node attributes as countries, including USA, China, Canada,
%         if a user is from China, then his node feature is (0, 1, 0).
%         If N1 and N2 are empty, i.e., N1 = [], N2 = [], then no node
%         attributes are input.
%
%   E1, E2: an L*1 (or 1*L) cell, where E1{i} is the n1*n1 matrix and nonzero
%         entry is the i-th attribute of edges. E2{i} is same. Similarly, if
%         the input edge attributes are categorical, we can use one hot
%         encoding, i.e., E1{i}(a,b)=1 if edge (a,b) has categorical
%         attribute i. If E1 and E2 are empty, i.e., E1 = {} or [], E2 = {}
%         or [], then no edge attributes are input.
%
%   H: a n2*n1 prior node similarity matrix, e.g., degree similarity. H
%      should be normalized, e.g., sum(sum(H)) = 1.
%   alpha: decay factor
%   maxiter, tol: maximum number of iterations and difference tolerance.
%
% Output:
%   S: an n2*n1 alignment matrix, entry (x,y) represents to what extent node-
%    x in A2 is aligned to node-y in A1
%
% Reference:
%   Zhang, Si, and Hanghang Tong. "FINAL: Fast Attributed Network Alignment."
%   Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. ACM, 2016.

n1 = size(A1, 1); n2 = size(A2, 1);

% If no node attributes input, then initialize as a vector of 1
% so that all nodes are treated to have the same attributes which
% is equivalent to no given node attribute.
if isempty(N1) && isempty(N2)
    N1 = ones(n1, 1);
    N2 = ones(n2, 1);
end

% If no edge attributes are input, i.e., E1 and E2 are empty, then
% initialize as a cell with 1 element, which is same as adjacency matrix
% but the entries that are nonzero in adjacency matrix are equal to 1 so
% that all edges are treated as with the same edge attributes. This is
% equivalent to no given edge attributes.
if isempty(E1) && isempty(E2)
    E1 = cell(1,1); E2 = cell(1,1);
    E1{1} = A1; E2{1} = A2;
    E1{1}(A1 > 0) = 1; E2{1}(A2 > 0) = 1;
end

% numel (not size(E1, 2)) so that the documented L*1 cell layout is honoured
% as well as a 1*L layout; size(E1, 2) would see only one attribute in an
% L*1 cell and silently drop the rest.
K = size(N1, 2); L = numel(E1);
T1 = spconvert([n1, n1, 0]);
T2 = spconvert([n2, n2, 0]);

% Normalize edge feature vectors
for l = 1: L
    T1 = T1 + E1{l}.^2; % calculate ||v1||_2^2
    T2 = T2 + E2{l}.^2; % calculate ||v2||_2^2
end
T1 = spfun(@(x) 1./sqrt(x), T1); T2 = spfun(@(x) 1./sqrt(x), T2);
for l = 1: L
    E1{l} = E1{l} .* T1; % normalize each entry by vector norm T1
    E2{l} = E2{l} .* T2; % normalize each entry by vector norm T2
end

% The masked edge-attribute matrices E{l} .* A do not change after this
% point, so compute them once instead of on every use / iteration.
EA1 = cell(1, L); EA2 = cell(1, L);
for l = 1: L
    EA1{l} = E1{l} .* A1;
    EA2{l} = E2{l} .* A2;
end

% Normalize node feature vectors
K1 = sum(N1.^2, 2).^(-0.5); K1(K1 == Inf) = 0;
K2 = sum(N2.^2, 2).^(-0.5); K2(K2 == Inf) = 0;
N1 = bsxfun(@times, K1, N1); % normalize the node attribute for A1
N2 = bsxfun(@times, K2, N2); % normalize the node attribute for A2

% Compute node feature cosine cross-similarity
N = spconvert([n1*n2, 1, 0]);
for k = 1: K
    N = N + kron(N1(:, k), N2(:, k)); % compute N as a kronecker similarity
end

% Compute the Kronecker degree vector
d = spconvert([n1*n2, 1, 0]);
tic;
for l = 1: L
    for k = 1: K
        d = d + kron(EA1{l} * N1(:, k), EA2{l} * N2(:,k));
    end
end
fprintf('Time for degree: %.2f sec\n', toc);
D = N .* d; DD = D.^(-0.5);
DD(D == 0) = 0; % define inf to 0

% fixed-point solution
q = DD .* N;
h = H(:); s = h;

for i = 1: maxiter
    fprintf('iteration %d\n', i);
    tic;
    prev = s;
    M = reshape(q.*s, n2, n1);
    S = spconvert([n2, n1, 0]);
    for l = 1: L
        S = S + EA2{l} * M * EA1{l}; % calculate the consistency part
    end
    s = (1 - alpha) * h + alpha * q .* S(:); % add the prior part
    delta = norm(s-prev); % change between consecutive iterates

    fprintf('Time for iteration %d: %.2f sec, diff = %.5f\n', i, toc, 100*delta);
    if delta < tol % if converge
        break;
    end
end

S = reshape(s, n2, n1); % reshape the similarity vector to a matrix
68 changes: 68 additions & 0 deletions FINAL/matlab/greedy_match.m
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
function [M, sim_matrix] = greedy_match(X)
% greedy_match Computes a bipartite matching based on the scores in input
% matrix
%
% Input arguments:
%   - X: the matrix with the similarity scores (similarity matrix).
%     Note that element X(i,j) is the similarity score of node i in B
%     and node j in A; if B has m nodes and A has n nodes then X is an
%     m x n matrix.
% Output arguments:
%   - M: the sparse matrix with the matches: M(i,j) = 1.0 iff node i in B
%     matches with node j in A. m x n matrix (same dimensions with X) -
%     called also "matching matrix".
%   - sim_matrix: the transpose of the input X (an n x m matrix).

[m, n] = size(X);
minSize = min(m, n);

usedRows = false(m, 1);
usedCols = false(n, 1);
row = zeros(minSize, 1);
col = zeros(minSize, 1);

% Visit the entries of X from the highest score to the lowest; ix holds the
% linear (column-major) position of each entry in X.
[~, ix] = sort(X(:), 'descend');

matched = 1;
index = 1;
while (matched <= minSize)
    ipos = ix(index); % position in the original vectorized matrix
    jc = ceil(ipos / m);       % column of that position
    ic = ipos - (jc - 1) * m;  % row of that position, always in 1..m
    if (~usedRows(ic) && ~usedCols(jc))
        % Both endpoints are still free: accept the pair.
        row(matched) = ic;
        col(matched) = jc;
        usedRows(ic) = true;
        usedCols(jc) = true;

        matched = matched + 1;
    end

    index = index + 1;
end
data = ones(minSize, 1);

M = sparse(row, col, data, m, n);

sim_matrix = transpose(X);
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,7 @@ A General Multilevel Framework for Network Alignment
<p align="center">
<img src="https://raw.githubusercontent.com/GemsLab/CAPER/master/approach.png" width="700" alt="CAPER overview">
</p>

`python3 nhem.py --data ../data/arenas/arenas800-3/arenas_combined_edges.txt --coarsen-level 3 --output-path test.pkl`

`python main.py --true_align data/arenas/arenas800-3/arenas_edges-mapping-permutation.txt --combined_graph coarsening/test.pkl --embmethod xnetMF --alignmethod REGAL --refinemethod RefiNA --coarsen`
40 changes: 40 additions & 0 deletions coarsening/graph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import numpy as np

class Graph(object):
    """Graph stored as one flat neighbor array plus per-node offsets.

    Note: ``adj_list`` shows each edge twice, so ``edge_num`` is really two
    times the edge number for an undirected graph.
    """

    def __init__(self, node_num, edge_num):
        self.node_num = node_num  # n
        self.edge_num = edge_num  # m

        # One big array holding the neighbors of every node; -1 = unset.
        self.adj_list = np.full(edge_num, -1, dtype=np.int32)
        # adj_idx[i] is where node i's neighbors begin inside adj_list. One
        # extra element is padded at the end and is meant to hold edge_num,
        # i.e. self.adj_idx[-1] = edge_num (filled in by whoever builds the
        # graph; it starts as 0 here).
        self.adj_idx = np.zeros(node_num + 1, dtype=np.int32)
        # Same dimension as adj_list: weight on each edge. CAN be floats.
        self.adj_wgt = np.zeros(edge_num, dtype=np.float32)
        self.node_wgt = np.zeros(node_num, dtype=np.float32)
        # Id of the node each node is mapped to in the coarser graph.
        self.cmap = np.full(node_num, -1, dtype=np.int32)

        # Weighted degree: the sum of the adjacency weight of each vertex,
        # including self-loop.
        self.degree = np.zeros(node_num, dtype=np.float32)
        self.A = None
        self.C = None  # Matching Matrix

        self.coarser = None
        self.finer = None

    def _neigh_span(self, idx):
        '''Slice of the flat arrays that belongs to node ``idx``.'''
        return slice(self.adj_idx[idx], self.adj_idx[idx + 1])

    def resize_adj(self, edge_num):
        '''Resize the adjacency list/wgts based on the number of edges.'''
        self.adj_list, self.adj_wgt = (
            np.resize(self.adj_list, edge_num),
            np.resize(self.adj_wgt, edge_num),
        )

    def get_neighs(self, idx):
        '''Obtain the list of neighbors given a node.'''
        return self.adj_list[self._neigh_span(idx)]

    def get_neigh_edge_wgts(self, idx):
        '''Obtain the weights of the edges to the neighbors of a node.'''
        return self.adj_wgt[self._neigh_span(idx)]
129 changes: 129 additions & 0 deletions coarsening/nhem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from collections import defaultdict
from graph import Graph
import numpy as np
from utils import cmap2C
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from utils import from_alignment_edgelist_to_graph, create_coarse_graph
import importlib
import logging
import numpy as np
import pdb
import os
import time
import pickle

#### Credit: codes partially borrowed from MILE: https://github.com/jiongqian/MILE
#### Also cited in paper

def parse_args():
    """Parse the command-line options of the coarsening script.

    Returns the ``argparse`` namespace with ``data``, ``coarsen_level``,
    ``workers`` and ``output_path``.
    """
    cli = ArgumentParser(conflict_handler='resolve',
                         formatter_class=ArgumentDefaultsHelpFormatter)
    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ('--data', dict(required=True, help='Input graph file')),
        ('--coarsen-level', dict(default=2, type=int,
                                 help='MAX number of levels of coarsening.')),
        ('--workers', dict(default=4, type=int, help='Number of workers.')),
        ('--output-path', dict(required=True,
                               help='Path to save the output pickle file')),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args()

def normalized_adj_wgt(graph):
    """Return every edge weight divided by sqrt(deg(u) * deg(v)).

    Args:
        graph: object exposing ``node_num``, ``adj_idx``, ``adj_list``,
            ``adj_wgt`` and ``degree`` (see ``Graph``).

    Returns:
        A float32 array with the same shape as ``graph.adj_wgt``; entry ``j``
        is the normalized weight of the edge stored in adjacency slot ``j``.
    """
    adj_wgt = graph.adj_wgt
    adj_idx = graph.adj_idx
    adj_list = graph.adj_list
    degree = graph.degree
    norm_wgt = np.zeros(adj_wgt.shape, dtype=np.float32)
    for i in range(graph.node_num):
        for j in range(adj_idx[i], adj_idx[i + 1]):
            neigh = adj_list[j]
            # adj_wgt is indexed by adjacency slot, not by node id: the
            # weight of edge (i, neigh) lives at position j.
            norm_wgt[j] = adj_wgt[j] / np.sqrt(degree[i] * degree[neigh])
    return norm_wgt

def normalized_heavy_edge_matching(args, graph):
    '''Greedily pair each node with its heaviest (normalized) free neighbor.

    Writes the resulting coarse-node id of every node into ``graph.cmap`` and
    returns ``(groups, coarse_graph_size)``, where each group is the list of
    one or two fine nodes merged into a single coarse node.
    '''
    n_nodes = graph.node_num
    neighbors = graph.adj_list  # big array for neighbors.
    offsets = graph.adj_idx  # beginning idx of neighbors.
    vertex_wgt = graph.node_wgt  # weight on node
    cmap = graph.cmap
    edge_score = normalized_adj_wgt(graph)

    # Rough estimation of the coarse size, and the resulting cap on the
    # combined node weight of a merged pair.
    target_size = max(1, graph.node_num // (2 ** args.coarsen_level))
    wgt_cap = int((5.0 * graph.node_num) / target_size)

    groups = []  # a list of groups, each group corresponding to one coarse node.
    taken = [False] * n_nodes

    # Visit nodes in increasing order of their number of adjacency slots.
    slot_counts = [offsets[v + 1] - offsets[v] for v in range(0, n_nodes)]
    for v in np.argsort(slot_counts):
        if taken[v]:
            continue
        partner = v
        best = -1
        for slot in range(offsets[v], offsets[v + 1]):
            u = neighbors[slot]
            if u == v:
                # KEY: exclude self-loop. Otherwise, mostly matching with itself.
                continue
            score = edge_score[slot]
            if taken[u]:
                continue
            if best < score and vertex_wgt[v] + vertex_wgt[u] <= wgt_cap:
                partner = u
                best = score
        # partner may still be v itself, meaning no match was found.
        taken[v] = taken[partner] = True
        groups.append([v] if v == partner else [v, partner])

    # Coarse-node ids are simply the positions of the groups.
    for coarse_id, members in enumerate(groups):
        for member in members:
            cmap[member] = coarse_id
    return (groups, len(groups))

def multilevel_embed(args, graph):
    '''Coarsen ``graph`` ``args.coarsen_level`` times.

    Returns the list of graphs from the input graph (index 0) to the
    coarsest one, i.e. ``args.coarsen_level + 1`` entries.
    '''
    # Step-1: Graph Coarsening.
    hierarchy = [graph]
    current = graph
    for _ in range(args.coarsen_level):
        groups, coarse_size = normalized_heavy_edge_matching(args, current)
        current = create_coarse_graph(current, groups, coarse_size)
        hierarchy.append(current)

    return hierarchy

if __name__ == "__main__":
    # Script entry point: read the combined edge list, coarsen every graph
    # it contains and pickle the per-level A / C attributes of each graph.
    before_emb = time.time()
    seed = 123
    np.random.seed(seed)
    args = parse_args()

    input_graph_path = args.data
    graphs, mapping = from_alignment_edgelist_to_graph(input_graph_path)
    labels = None
    embed_As = []
    embed_Cs = []
    for graph in graphs:
        # Generate multilevel projects
        graph_embeds = multilevel_embed(args, graph)
        # Only the first coarsen_level graphs are read; the last (coarsest)
        # entry of graph_embeds is not saved.
        embed_As.append([graph_embeds[i].A for i in range(args.coarsen_level)])
        embed_Cs.append([graph_embeds[i].C for i in range(args.coarsen_level)])
    to_save = {'A_list': embed_As, 'matches': embed_Cs, 'labels': labels}
    # Context manager so the file handle is closed even if pickling fails.
    with open(args.output_path, "wb") as pickle_out:
        pickle.dump(to_save, pickle_out)
    after_emb = time.time()
    total_time = after_emb - before_emb
    print(("time for embed (in seconds): %f" % total_time))
    print("MILE completed")
Binary file added coarsening/test.pkl
Binary file not shown.
Loading

0 comments on commit 857d57b

Please sign in to comment.