diff --git a/dynetlsm/datasets/samples_generator.py b/dynetlsm/datasets/samples_generator.py index f2e9e3c..17a7945 100644 --- a/dynetlsm/datasets/samples_generator.py +++ b/dynetlsm/datasets/samples_generator.py @@ -17,6 +17,7 @@ __all__ = ['network_from_dynamic_latent_space', 'simple_splitting_dynamic_network', + 'synthetic_static_community_dynamic_network', 'synthetic_dynamic_network'] @@ -205,6 +206,83 @@ def simple_splitting_dynamic_network(n_nodes=120, n_time_steps=9, return Y, z +def synthetic_static_community_dynamic_network( + n_nodes=100, n_time_steps=5, n_groups=6, + intercept=1.0, lmbda=0.8, sticky_const=20., + sigma_shape=6, sigma_scale=20, + random_state=42): + rng = check_random_state(random_state) + + # group locations + mus = np.array([[-3, 0], + [3, 0], + [-1.5, 0], + [1.5, 0], + [0, 2.0], + [0, -2.0]]) + + if n_groups > 6: + raise ValueError("Only a maximum of six groups allowed for now.") + + # group spread + sigmas = np.sqrt(1. / rng.gamma(shape=sigma_shape, scale=sigma_scale, + size=n_groups)) + + # sample initial distribution + w0 = rng.dirichlet(np.repeat(10, n_groups)) # E[p] = 1 / n_groups + + # set-up transition distribution + with np.errstate(divide='ignore'): + wt = 1. / pairwise_distances(mus) + + # only took necessary groups + wt = wt[:n_groups][:, :n_groups] + diag_indices = np.diag_indices_from(wt) + wt[diag_indices] = 0 + wt[diag_indices] = sticky_const * np.max(wt, axis=1) + wt /= wt.sum(axis=1).reshape(-1, 1) + + # run data generating process + X, z = [], [] + + # t = 0 + z0 = rng.choice(np.arange(n_groups), p=w0, size=n_nodes) + X0 = np.zeros((n_nodes, 2), dtype=np.float64) + for group_id in range(n_groups): + group_count = np.sum(z0 == group_id) + X0[z0 == group_id, :] = (sigmas[group_id] * rng.randn(group_count, 2) + + mus[group_id]) + X.append(X0) + z.append(z0) + + for t in range(1, n_time_steps): + zt = np.zeros(n_nodes, dtype=np.int) + for group_id in range(n_groups): + group_mask = z[t - 1] == group_id + zt[group_mask] = rng.choice(np.arange(n_groups), p=wt[group_id, :], + size=np.sum(group_mask)) + + Xt = np.zeros((n_nodes, 2), dtype=np.float64) + for group_id in range(n_groups): + group_mask = zt == group_id + group_count = np.sum(group_mask) + Xt[group_mask, :] = ( + sigmas[group_id] * rng.randn(group_count, 2) + ( + lmbda * mus[group_id] + (1 - lmbda) * X[t-1][group_mask, :]) + ) + + X.append(Xt) + z.append(zt) + + X = np.stack(X, axis=0) + z = np.vstack(z) + + Y, _ = network_from_dynamic_latent_space(X, intercept=intercept, + random_state=rng) + + return Y, X, z, intercept + + def synthetic_dynamic_network(n_nodes=120, n_time_steps=9, intercept=1.0, lmbda=0.8, sticky_const=20., sigma_shape=6, sigma_scale=20, is_directed=False, diff --git a/examples/GoT.py b/examples/GoT.py index 015b1e4..c6ae323 100644 --- a/examples/GoT.py +++ b/examples/GoT.py @@ -1,3 +1,9 @@ +""" +Runs the analysis of the GoT character interactions network found in the +paper 'A Bayesian nonparametric latent space approach to modeling evolving +communities in dynamic networks' by Joshua Loyal and Yuguo Chen +""" + from dynetlsm import DynamicNetworkHDPLPCM from dynetlsm.datasets import load_got from dynetlsm.plots import ( @@ -7,10 +13,6 @@ ) -# Runs the analysis of the GoT character interactions network found in the -# paper "A Bayesian nonparametric latent space approach to modeling evolving -# "communities in dynamic networks" by Joshua Loyal and Yuguo Chen - # Load GoT character interaction networks Y, names = load_got(seasons=[1,2,3,4], weight_min=10) diff --git a/examples/homogeneous_simulation.py b/examples/homogeneous_simulation.py new file mode 100644 index 0000000..8dde147 --- /dev/null +++ b/examples/homogeneous_simulation.py @@ -0,0 +1,162 @@ +import glob +import os + +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns + +from sklearn.utils import check_random_state +from sklearn.metrics import adjusted_rand_score + +from dynetlsm import DynamicNetworkHDPLPCM +from dynetlsm.datasets import synthetic_static_community_dynamic_network +from dynetlsm.model_selection.approx_bic import calculate_cluster_counts +from dynetlsm.metrics import variation_of_information +from dynetlsm.network_statistics import density, modularity + + +def counts_per_time_step(z): + n_time_steps = z.shape[0] + group_counts = np.zeros(n_time_steps, dtype=np.int) + for t in range(n_time_steps): + group_counts[t] = np.unique(z[t]).shape[0] + + return group_counts + +def posterior_per_time_step(model): + n_time_steps = model.Y_fit_.shape[0] + probas = np.zeros((n_time_steps, model.n_components + 1)) + for t in range(n_time_steps): + freq = model.posterior_group_counts_[t] + index = model.posterior_group_ids_[t] + probas[t, index] = freq / freq.sum() + + return probas + + +def benchmark_single(n_iter=10000, burn=5000, tune=1000, + outfile_name='benchmark', + random_state=None): + random_state = check_random_state(random_state) + + # generate simulated networks + Y, X, z, intercept = synthetic_static_community_dynamic_network( + n_time_steps=6, n_nodes=120, random_state=random_state) + + # fit HDP-LPCM + model = DynamicNetworkHDPLPCM(n_iter=n_iter, + burn=burn, + tune=tune, + tune_interval=1000, + is_directed=False, + selection_type='vi', + n_components=10, + random_state=random_state).fit(Y) + + # MAP: number of clusters per time point + map_counts = counts_per_time_step(model.z_) + + # Posterior group count probabilities + probas = posterior_per_time_step(model) + + # create dataframe of results + results = pd.DataFrame(probas) + results['map_counts'] = map_counts + + # goodness-of-fit metrics for MAP + results['insample_auc'] = model.auc_ + results['vi'] = variation_of_information(z.ravel(), model.z_.ravel()) + + # time average VI + vi = 0. + for t in range(Y.shape[0]): + vi += variation_of_information(z[t], model.z_[t]) + results['vi_avg'] = vi / Y.shape[0] + + results['rand_index'] = adjusted_rand_score(z.ravel(), model.z_.ravel()) + + # time average rand + adj_rand = 0. + for t in range(Y.shape[0]): + adj_rand += adjusted_rand_score(z[t], model.z_[t]) + results['rand_avg'] = adj_rand / Y.shape[0] + + # info about simulated networks + results['modularity'] = modularity(Y, z) + results['density'] = density(Y) + + results.to_csv(outfile_name, index=False) + + +# NOTE: This is meant to be run in parallel on a computer cluster! +n_reps = 50 +out_dir = 'results' + +# create a directory to store the results +if not os.path.exists('results'): + os.mkdir(out_dir) + +for i in range(n_reps): + benchmark_single(n_iter=35000, burn=10000, tune=5000, random_state=i, + outfile_name=os.path.join( + out_dir, 'benchmark_{}.csv'.format(i))) + +# calculate median metric values +n_time_steps = 6 +n_groups = 10 + +n_files = len(glob.glob('results/*')) +stat_names = ['insample_auc', 'vi_avg', 'rand_avg'] +data = np.zeros((n_files, len(stat_names))) +for i, file_name in enumerate(glob.glob('results/*')): + df = pd.read_csv(file_name) + data[i] = df.loc[0, stat_names].values + +data = pd.DataFrame(data, columns=stat_names) +print('Median Metrics:') +print(data.median(axis=0)) +print('Metrics SD:') +print(data.std(axis=0)) + +# plot posterior boxplots +data = {'probas': [], 'cluster_number': [], 't': []} +for file_name in glob.glob('results/*'): + df = pd.read_csv(file_name) + for t in range(n_time_steps): + for i in range(1, n_groups): + data['probas'].append(df.iloc[t, i]) + data['cluster_number'].append(i) + data['t'].append(t + 1) + +data = pd.DataFrame(data) + +plt.rc('font', family='sans-serif', size=16) +g = sns.catplot(x='cluster_number', y='probas', col='t', + col_wrap=3, kind='box', data=data) + +for ax in g.axes: + ax.set_ylabel('posterior probability') + ax.set_xlabel('# of groups') + +g.fig.tight_layout() + +plt.savefig('cluster_posterior.png', dpi=300) + +# clear figure +plt.clf() + +# plot selected number of groups for each simulation +data = np.zeros((n_time_steps, n_groups), dtype=np.int) +for sim_id, file_name in enumerate(glob.glob('results/*')): + df = pd.read_csv(file_name) + for t in range(n_time_steps): + data[t, df.iloc[t, n_groups + 1] - 1] +=1 + +data = pd.DataFrame(data, columns=range(1, n_groups + 1), index=range(1, n_time_steps + 1)) +mask = data.values == 0 + +g = sns.heatmap(data, annot=True, cmap="Blues", cbar=False, mask=mask) +g.set_xlabel('# of groups') +g.set_ylabel('t') +plt.savefig('num_clusters.png', dpi=300) diff --git a/examples/military_alliances.py b/examples/military_alliances.py index 69e406d..9a1b8f6 100644 --- a/examples/military_alliances.py +++ b/examples/military_alliances.py @@ -1,3 +1,9 @@ +""" +Runs the analysis of the military alliances network found in the +paper 'A Bayesian nonparametric latent space approach to modeling evolving +communities in dynamic networks' by Joshua Loyal and Yuguo Chen +""" + from dynetlsm import DynamicNetworkHDPLPCM from dynetlsm.datasets import load_alliances from dynetlsm.plots import ( @@ -7,10 +13,6 @@ ) -# Runs the analysis of the military alliances network found in the -# paper "A Bayesian nonparametric latent space approach to modeling evolving -# "communities in dynamic networks" by Joshua Loyal and Yuguo Chen - # Load military alliances networks Y, names = load_alliances() diff --git a/examples/sampson_monks.py b/examples/sampson_monks.py index 2e2958a..8976ede 100644 --- a/examples/sampson_monks.py +++ b/examples/sampson_monks.py @@ -1,3 +1,9 @@ +""" +Runs the analysis of the Sampson's monastery network found in the +paper 'A Bayesian nonparametric latent space approach to modeling evolving +communities in dynamic networks' by Joshua Loyal and Yuguo Chen +""" + from dynetlsm import DynamicNetworkHDPLPCM from dynetlsm.datasets import load_monks from dynetlsm.plots import ( @@ -8,10 +14,6 @@ ) -# Runs the analysis of the Sampson's monastery network found in the -# paper "A Bayesian nonparametric latent space approach to modeling evolving -# "communities in dynamic networks" by Joshua Loyal and Yuguo Chen - # Load Sampson's monastery network Y, labels, names = load_monks(dynamic=True, is_directed=False) diff --git a/setup.py b/setup.py index abbaf96..90f9e32 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ def get_sources(): for name in os.listdir(src_path): path = os.path.join(src_path, name) if os.path.isfile(path) and path.endswith(".c"): - files.append(path) + files.append(os.path.relpath(path)) return files @@ -119,7 +119,7 @@ def make_extension(ext_name, macros=[]): include_dirs = [get_include] + include_dirs return Extension( mod_name, - sources=[ext_path] + get_sources(), + sources=[os.path.relpath(ext_path)] + get_sources(), include_dirs=include_dirs, extra_compile_args=["-O3", "-Wall", "-fPIC"], define_macros=macros)