-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path dataset.py
110 lines (87 loc) · 4.21 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import numpy as np
import networkx as nx
import pickle as pkl
import scipy.sparse as sp
import sys
import json
import os.path as osp
from deeprobust.graph.data import Dataset
def sample_mask(idx, l):
    """
    Create a boolean mask of length ``l`` that is True at the positions in ``idx``.

    Parameters
    ----------
    idx : array-like of int
        Indices to mark as True.
    l : int
        Total length of the mask.

    Returns
    -------
    numpy.ndarray of bool
        Boolean mask of shape ``(l,)``.
    """
    # ``np.bool`` was deprecated in NumPy 1.20 and removed in 1.24; allocate
    # the array with the builtin ``bool`` dtype directly instead of converting.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask
def parse_index_file(filename):
    """
    Parse a text file containing one integer index per line.

    Parameters
    ----------
    filename : str
        Path to the index file.

    Returns
    -------
    list of int
        Indices in file order.
    """
    # Use a context manager so the file handle is closed deterministically;
    # the original ``open(filename)`` in the for-loop was never closed.
    with open(filename) as f:
        return [int(line.strip()) for line in f]
def get_prognn_splits(json_file):
    """
    Load Pro-GNN-style train/validation/test node splits from a JSON file.

    The file must contain the keys ``'idx_train'``, ``'idx_val'`` and
    ``'idx_test'``, each a list of node indices.  (The previous docstring
    incorrectly described this as selecting target nodes by degree.)

    Parameters
    ----------
    json_file : str
        Path to the splits JSON file.

    Returns
    -------
    tuple of numpy.ndarray
        ``(idx_train, idx_val, idx_test)`` as integer index arrays.
    """
    with open(json_file, 'r') as f:
        # json.load reads straight from the file object (same result as
        # json.loads(f.read()) without materialising the text twice).
        idx = json.load(f)
    return np.array(idx['idx_train']), \
        np.array(idx['idx_val']), np.array(idx['idx_test'])
def load_datasp(dataset_str):
    """
    This code adapted from the Tensorflow implementation of GCN by Thomas Kipf (https://github.com/tkipf/gcn).
    Loads input data from gcn/data directory
    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.
    All objects above must be saved using python pickle module.

    NOTE(review): the return order is (adj, features, labels, idx_train,
    idx_test, idx_val) — test BEFORE val, unlike the usual convention;
    callers must unpack accordingly.

    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    # All data files are expected under this fixed local directory.
    root = './Data_gcn/'
    if dataset_str == 'polblogs':
        # polblogs is loaded through deeprobust's Dataset class instead of
        # the pickled Planetoid files; its splits come from a JSON file.
        data = Dataset(root=root, name=dataset_str, setting='gcn', seed=10)
        adj, features, labels = data.adj, data.features, data.labels
        json_file = osp.join(root, '{}_gcn_splits.json'.format(dataset_str))
        idx_train, idx_val, idx_test = get_prognn_splits(json_file)
    else:
        # Planetoid-style pickled objects, loaded in this fixed order.
        names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
        objects = []
        for i in range(len(names)):
            with open(root + "ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
                if sys.version_info > (3, 0):
                    # latin1 keeps Python-2-pickled numpy arrays loadable on Python 3.
                    objects.append(pkl.load(f, encoding='latin1'))
                else:
                    objects.append(pkl.load(f))
        x, y, tx, ty, allx, ally, graph = tuple(objects)
        # Test indices are stored shuffled; sort once for the range logic below.
        test_idx_reorder = parse_index_file(root + "ind.{}.test.index".format(dataset_str))
        test_idx_range = np.sort(test_idx_reorder)
        if dataset_str == 'citeseer':
            # Fix citeseer dataset (there are some isolated nodes in the graph)
            # Find isolated nodes, add them as zero-vecs into the right position
            test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
            tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
            tx_extended[test_idx_range-min(test_idx_range), :] = tx
            tx = tx_extended
            ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
            ty_extended[test_idx_range-min(test_idx_range), :] = ty
            ty = ty_extended
        features = sp.vstack((allx, tx)).tolil()
        # Re-permute the test rows so node i's features land at row i
        # (the test.index file lists them in shuffled order).
        features[test_idx_reorder, :] = features[test_idx_range, :]
        features = features.tocsr()
        adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
        labels = np.vstack((ally, ty))
        # Same re-permutation for the one-hot label rows.
        labels[test_idx_reorder, :] = labels[test_idx_range, :]
        # Collapse one-hot rows to integer class ids.
        labels = labels.argmax(axis=1)
        idx_test = test_idx_range.tolist()
        idx_test = np.array(idx_test)
        # Planetoid convention: first len(y) nodes train, next 500 validate.
        idx_train = np.array(range(len(y)))
        idx_val = np.array(range(len(y), len(y)+500))
    return adj, features, labels, idx_train, idx_test, idx_val