code update

GemsLab · Aug 13, 2022 · 857d57b · 857d57b
1 parent f6cde07
commit 857d57b
Show file tree

Hide file tree

Showing 363 changed files with 10,271,280 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+.python-version
diff --git a/FINAL/matlab/FINAL.m b/FINAL/matlab/FINAL.m
@@ -0,0 +1,138 @@
+function S = FINAL(A1, A2, N1, N2, E1, E2, H, alpha, maxiter, tol)
+% FINAL  Attributed network alignment by fixed-point iteration.
+%
+% Description:
+%   The algorithm is the generalized attributed network alignment algorithm.
+%   It handles the cases where node attributes and/or edge attributes are
+%   given. If no node attributes or edge attributes are given, the
+%   corresponding input variables are passed empty, e.g., N1 = [], E1 = {}.
+%   The algorithm can handle either numerical or categorical attributes
+%   (feature vectors) for both edges and nodes.
+%
+%   The algorithm uses cosine similarity to calculate node and edge feature
+%   vector similarities. E.g., sim(v1, v2) = <v1, v2>/(||v1||_2*||v2||_2).
+%   For categorical attributes, this is still equivalent to the indicator
+%   function in the original published paper.
+%
+% Input:
+%   A1, A2: Input adjacency matrices with n1, n2 nodes
+%   N1, N2: Node attribute matrices, N1 is an n1*K matrix, N2 is an n2*K
+%         matrix, each row is a node, and each column represents an
+%         attribute. If the input node attributes are categorical, we can
+%         use one hot encoding to represent each node feature as a vector.
+%         And the input N1 and N2 are still n1*K and n2*K matrices.
+%         E.g., for node attributes as countries, including USA, China, Canada,
+%         if a user is from China, then his node feature is (0, 1, 0).
+%         If N1 and N2 are both empty, i.e., N1 = [], N2 = [], then no node
+%         attributes are input.
+%
+%   E1, E2: cell arrays with L elements (either L*1 or 1*L), where E1{i} is
+%         the n1*n1 matrix whose nonzero entries are the i-th attribute of
+%         the edges. E2{i} is the same for the second graph. Similarly, if
+%         the input edge attributes are categorical, we can use one hot
+%         encoding, i.e., E1{i}(a,b)=1 if edge (a,b) has categorical
+%         attribute i. If E1 and E2 are both empty, i.e., E1 = {} or [],
+%         E2 = {} or [], then no edge attributes are input.
+%
+%   H: an n2*n1 prior node similarity matrix, e.g., degree similarity. H
+%      should be normalized, e.g., sum(sum(H)) = 1.
+%   alpha: decay factor
+%   maxiter, tol: maximum number of iterations and difference tolerance.
+%
+% Output:
+%   S: an n2*n1 alignment matrix, entry (x,y) represents to what extent
+%    node-x in A2 is aligned to node-y in A1
+%
+% Reference:
+%   Zhang, Si, and Hanghang Tong. "FINAL: Fast Attributed Network Alignment."
+%   Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. ACM, 2016.
+
+n1 = size(A1, 1); n2 = size(A2, 1);
+
+% If no node attributes are input, then initialize as a vector of 1
+% so that all nodes are treated as having the same attribute, which
+% is equivalent to no given node attribute.
+if isempty(N1) && isempty(N2)
+    N1 = ones(n1, 1);
+    N2 = ones(n2, 1);
+end
+
+% If no edge attributes are input, i.e., E1 and E2 are empty, then
+% initialize as a cell with 1 element, which is same as adjacency matrix
+% but the entries that are nonzero in adjacency matrix are equal to 1 so
+% that all edges are treated as with the same edge attributes. This is
+% equivalent to no given edge attributes.
+if isempty(E1) && isempty(E2)
+    E1 = cell(1,1); E2 = cell(1,1);
+    E1{1} = A1; E2{1} = A2;
+    E1{1}(A1 > 0) = 1; E2{1}(A2 > 0) = 1;
+end
+
+% numel (rather than size(E1, 2)) counts the edge attributes for both the
+% documented L*1 layout and a 1*L layout; size(E1, 2) is 1 for an L*1 cell
+% and would silently ignore every attribute after the first.
+K = size(N1, 2); L = numel(E1);
+T1 = spconvert([n1, n1, 0]);
+T2 = spconvert([n2, n2, 0]);
+
+% Normalize edge feature vectors
+
+for l = 1: L
+    T1 = T1 + E1{l}.^2; % calculate ||v1||_2^2
+    T2 = T2 + E2{l}.^2; % calculate ||v2||_2^2
+end
+T1 = spfun(@(x) 1./sqrt(x), T1); T2 = spfun(@(x) 1./sqrt(x), T2);
+for l = 1: L
+   E1{l} = E1{l} .* T1; % normalize each entry by vector norm T1
+   E2{l} = E2{l} .* T2; % normalize each entry by vector norm T2
+end
+
+% Normalize node feature vectors (all-zero rows give Inf, which is reset to 0)
+K1 = sum(N1.^2, 2).^(-0.5); K1(K1 == Inf) = 0;
+K2 = sum(N2.^2, 2).^(-0.5); K2(K2 == Inf) = 0;
+N1 = bsxfun(@times, K1, N1); % normalize the node attribute for A1
+N2 = bsxfun(@times, K2, N2); % normalize the node attribute for A2
+
+% Compute node feature cosine cross-similarity
+N = spconvert([n1*n2, 1, 0]);
+
+for k = 1: K
+    N = N + kron(N1(:, k), N2(:, k));   % compute N as a kronecker similarity
+end
+
+% Compute the Kronecker degree vector
+d = spconvert([n1*n2, 1, 0]);
+tic;
+for l = 1: L
+    for k = 1: K
+        d = d + kron((E1{l} .* A1) * N1(:, k), (E2{l} .* A2) * N2(:,k));
+    end
+end
+fprintf('Time for degree: %.2f sec\n', toc);
+D = N .* d; DD = D.^(-0.5);
+DD(D == 0) = 0;     % define inf to 0
+
+% fixed-point solution
+q = DD .* N;
+h = H(:); s = h;
+
+for i = 1: maxiter
+    fprintf('iteration %d\n', i);
+    tic;
+    prev = s;
+    % TODO: debug output of the operand sizes
+    fprintf('size(N) is %s\n', mat2str(size(N)))
+    fprintf('size(H) is %s\n', mat2str(size(H)))
+    fprintf('size(s) is %s\n', mat2str(size(s)))
+    fprintf('size(q) is %s\n', mat2str(size(q)))
+    M = reshape(q.*s, n2, n1);
+    S = spconvert([n2, n1, 0]);
+    for l = 1: L
+        S = S + (E2{l} .* A2) * M * (E1{l} .* A1);    % calculate the consistency part
+    end
+    s = (1 - alpha) * h + alpha * q .* S(:);   % add the prior part
+    delta = norm(s-prev);   % named delta so the built-in diff is not shadowed
+
+    % note: the printed value is 100 times the difference that is compared to tol
+    fprintf('Time for iteration %d: %.2f sec, diff = %.5f\n', i, toc, 100*delta);
+    if delta < tol   % if converge
+        break;
+    end
+end
+
+S = reshape(s, n2, n1);   % reshape the similarity vector to a matrix
diff --git a/FINAL/matlab/greedy_match.m b/FINAL/matlab/greedy_match.m
@@ -0,0 +1,68 @@
+function [M, sim_matrix] = greedy_match(X)
+% greedy_match Computes a bipartite matching based on the scores in input
+% matrix
+%
+% Pairs are visited in descending score order; a pair is accepted when
+% neither its row nor its column has been used by an earlier accepted pair.
+%
+% Input arguments:
+% - X: the matrix with the similarity scores (similarity matrix).
+%     Note that element X(i,j) is the similarity score of node i in B
+%     and node j in A; if B has m nodes and A has n nodes then X is an
+%     m x n matrix.
+% Output arguments:
+% - M: the sparse matrix with the matches: M(i,j) = 1.0 iff node i in B
+%     matches with node j in A. m x n matrix (same dimensions with X) -
+%     called also "matching matrix".
+% - sim_matrix: the transpose of the input X (an n x m matrix).
+
+[m, n] = size(X);
+A = X;   % kept for the second output; terminated with ';' so X is not echoed
+x = X(:);
+clear X
+
+minSize = min(m, n);
+usedRows = zeros(m, 1);
+usedCols = zeros(n, 1);
+
+row = zeros(minSize, 1);
+col = zeros(minSize, 1);
+
+% ix(k) is the linear index (into the m x n matrix) of the k-th largest score
+[~, ix] = sort(x, 'descend');
+
+matched = 1;
+index = 1;
+while (matched <= minSize)
+    % convert the linear index back to (row, column) subscripts
+    [ic, jc] = ind2sub([m, n], ix(index));
+    if (usedRows(ic) ~= 1 && usedCols(jc) ~= 1)
+        row(matched) = ic;
+        col(matched) = jc;
+        usedRows(ic) = 1;
+        usedCols(jc) = 1;
+
+        matched = matched + 1;
+    end
+
+    index = index + 1;
+end
+data = ones(minSize, 1);
+
+M = sparse(row, col, data, m, n);
+
+sim_matrix = transpose(A);
diff --git a/README.md b/README.md
@@ -9,3 +9,7 @@ A General Multilevel Framework for Network Alignment
 <p align="center">
 <img src="https://raw.githubusercontent.com/GemsLab/CAPER/master/approach.png" width="700"  alt="CAPER overview">
 </p>
+
+`python3 nhem.py --data ../data/arenas/arenas800-3/arenas_combined_edges.txt --coarsen-level 3 --output-path test.pkl` 
+
+`python main.py --true_align data/arenas/arenas800-3/arenas_edges-mapping-permutation.txt --combined_graph coarsening/test.pkl --embmethod xnetMF --alignmethod REGAL --refinemethod RefiNA  --coarsen`
diff --git a/coarsening/graph.py b/coarsening/graph.py
@@ -0,0 +1,40 @@
+import numpy as np
+
+class Graph(object):
+    ''' Note: adj_list shows each edge twice. So edge_num is really two times of edge number for undirected graph.'''
+
+    def __init__(self, node_num, edge_num):
+        self.node_num = node_num  # n
+        self.edge_num = edge_num  # m
+        self.adj_list = np.zeros(edge_num, dtype=np.int32) - 1  # a big array for all the neighbors.
+        self.adj_idx = np.zeros(node_num + 1,
+                                dtype=np.int32)  # idx of the beginning neighbors in the adj_list. Pad one additional element at the end with value equal to the edge_num, i.e., self.adj_idx[-1] = edge_num
+        self.adj_wgt = np.zeros(edge_num,
+                                dtype=np.float32)  # same dimension as adj_list, wgt on the edge. CAN be float numbers.
+        self.node_wgt = np.zeros(node_num, dtype=np.float32)
+        self.cmap = np.zeros(node_num, dtype=np.int32) - 1  # mapped to coarser graph
+
+        # weighted degree: the sum of the adjacency weight of each vertex, including self-loop.
+        self.degree = np.zeros(node_num, dtype=np.float32)
+        self.A = None
+        self.C = None  # Matching Matrix
+
+        self.coarser = None
+        self.finer = None
+
+    def resize_adj(self, edge_num):
+        '''Resize the adjacency list/wgts based on the number of edges.'''
+        self.adj_list = np.resize(self.adj_list, edge_num)
+        self.adj_wgt = np.resize(self.adj_wgt, edge_num)
+
+    def get_neighs(self, idx):
+        '''obtain the list of neigbors given a node.'''
+        istart = self.adj_idx[idx]
+        iend = self.adj_idx[idx + 1]
+        return self.adj_list[istart:iend]
+
+    def get_neigh_edge_wgts(self, idx):
+        '''obtain the weights of neighbors given a node.'''
+        istart = self.adj_idx[idx]
+        iend = self.adj_idx[idx + 1]
+        return self.adj_wgt[istart:iend]
diff --git a/coarsening/nhem.py b/coarsening/nhem.py
@@ -0,0 +1,129 @@
+from collections import defaultdict
+from graph import Graph
+import numpy as np
+from utils import cmap2C
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from utils import from_alignment_edgelist_to_graph, create_coarse_graph
+import importlib
+import logging
+import numpy as np
+import pdb
+import os
+import time
+import pickle
+
+#### Credit: codes partially borrowed from MILE: https://github.com/jiongqian/MILE
+#### Also cited in paper
+
+def parse_args():
+    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter,
+                            conflict_handler='resolve')
+    parser.add_argument('--data', required=True, help='Input graph file')
+    parser.add_argument('--coarsen-level', default=2, type=int, help='MAX number of levels of coarsening.')
+    parser.add_argument('--workers', default=4, type=int, help='Number of workers.')
+    parser.add_argument('--output-path', required=True, help='Path to save the output pickle file')
+    args = parser.parse_args()
+    return args
+
+def normalized_adj_wgt(graph):
+    adj_wgt = graph.adj_wgt
+    adj_idx = graph.adj_idx
+    norm_wgt = np.zeros(adj_wgt.shape, dtype=np.float32)
+    degree = graph.degree
+    for i in range(graph.node_num):
+        for j in range(adj_idx[i], adj_idx[i + 1]):
+            neigh = graph.adj_list[j]
+            norm_wgt[j] = adj_wgt[neigh] / np.sqrt(degree[i] * degree[neigh])
+    return norm_wgt
+
+def normalized_heavy_edge_matching(args, graph):
+    '''Generate matchings using the hybrid method. It changes the cmap in graph object, 
+    return groups array and coarse_graph_size.'''
+    node_num = graph.node_num
+    adj_list = graph.adj_list  # big array for neighbors.
+    adj_idx = graph.adj_idx  # beginning idx of neighbors.
+    adj_wgt = graph.adj_wgt  # weight on edge
+    node_wgt = graph.node_wgt  # weight on node
+    cmap = graph.cmap
+    norm_adj_wgt = normalized_adj_wgt(graph)
+    coarsen_to = max(1, graph.node_num // (2 ** args.coarsen_level))  # rough estimation.
+    max_node_wgt = int((5.0 * graph.node_num) / coarsen_to)
+
+    groups = []  # a list of groups, each group corresponding to one coarse node.
+    matched = [False] * node_num
+
+    degree = [adj_idx[i + 1] - adj_idx[i] for i in range(0, node_num)]
+    sorted_idx = np.argsort(degree)
+    for idx in sorted_idx:
+        if matched[idx]:
+            continue
+        max_idx = idx
+        max_wgt = -1
+        for j in range(adj_idx[idx], adj_idx[idx + 1]):
+            neigh = adj_list[j]
+            if neigh == idx:  # KEY: exclude self-loop. Otherwise, mostly matching with itself.
+                continue
+            curr_wgt = norm_adj_wgt[j]
+            if ((not matched[neigh]) and max_wgt < curr_wgt and node_wgt[idx] + node_wgt[neigh] <= max_node_wgt):
+                max_idx = neigh
+                max_wgt = curr_wgt
+        # it might happen that max_idx is idx, which means cannot find a match for the node. 
+        matched[idx] = matched[max_idx] = True
+        if idx == max_idx:
+            groups.append([idx])
+        else:
+            groups.append([idx, max_idx])
+    coarse_graph_size = 0
+    for idx in range(len(groups)):
+        for ele in groups[idx]:
+            cmap[ele] = coarse_graph_size
+        coarse_graph_size += 1
+    return (groups, coarse_graph_size)
+
+def multilevel_embed(args, graph):
+    '''This method defines the multilevel embedding method.'''
+    start = time.time()
+
+    # Step-1: Graph Coarsening.
+    original_graph = graph
+    coarsen_level = args.coarsen_level
+    graphs = []
+    graphs.append(graph)
+    for i in range(coarsen_level):
+        match, coarse_graph_size = normalized_heavy_edge_matching(args, graph)
+        coarse_graph = create_coarse_graph(graph, match, coarse_graph_size)
+        graph = coarse_graph
+        graphs.append(graph)
+
+    return graphs
+
+if __name__ == "__main__":
+    before_emb = time.time()
+    seed = 123
+    np.random.seed(seed)
+    args = parse_args()
+
+    input_graph_path = args.data
+    graphs, mapping = from_alignment_edgelist_to_graph(input_graph_path)
+    labels = None
+    embed_As = []
+    embed_Cs = []
+    max_degree = 0
+    for graph in graphs:
+        As = []
+        Cs = []
+        # Generate multilevel projects
+        graph_embeds = multilevel_embed(args, graph)
+        for i in range(args.coarsen_level):
+            As.append(graph_embeds[i].A)
+            Cs.append(graph_embeds[i].C)
+        embed_As.append(As)
+        embed_Cs.append(Cs)
+    to_save = {'A_list': embed_As, 'matches': embed_Cs, 'labels': labels}
+    pickle_out = open(args.output_path, "wb")
+    pickle.dump(to_save, pickle_out)
+    pickle_out.close()
+    after_emb = time.time()
+    total_time = after_emb - before_emb
+    print(("time for embed (in seconds): %f" % total_time))
+    print("MILE completed")
diff --git a/coarsening/test.pkl b/coarsening/test.pkl