Add WholeGraph Support #6

Status: Closed. This pull request wanted to merge 24 commits.

Commits:
- 2189eb3: wg option (alexbarghi-nv, Jan 3, 2024)
- b6b939f: add ability to construct from wg embedding (alexbarghi-nv, Jan 3, 2024)
- 9d893ec: [BUG] Fix non-type template parameter to cugraph::relabel (#4064) (naimnv, Jan 4, 2024)
- 4366f9f: wholegraph (alexbarghi-nv, Jan 5, 2024)
- 4a5712d: generators (alexbarghi-nv, Jan 5, 2024)
- afb000c: style (alexbarghi-nv, Jan 5, 2024)
- cff6cdf: reformat (alexbarghi-nv, Jan 5, 2024)
- a59bd76: Remove Experimental Wrappers from GNN Code (#4070) (alexbarghi-nv, Jan 5, 2024)
- c7b720d: [FEA]: Add DASK edgelist and graph support to the Dataset API (#4035) (huiyuxie, Jan 9, 2024)
- cd5fc6f: build wheels for `cugraph-dgl` and `cugraph-pyg` (#4075) (tingyu66, Jan 9, 2024)
- 5e8e9b5: Fix MG weighted similarity test failure (#4054) (seunghwak, Jan 10, 2024)
- ae25ea1: Adds `nx-cugraph` benchmarks for 23.12 algos (SSSP, pagerank, hits, k… (rlratzel, Jan 10, 2024)
- 35ae8ef: Correct `cugraph-pyg` package name used in wheels and fix test script… (tingyu66, Jan 10, 2024)
- b22dd99: refactor CUDA versions in dependencies.yaml (#4084) (jameslamb, Jan 11, 2024)
- 88c3884: nx-cugraph: indicate which plc algorithms are used and version_added… (eriknw, Jan 11, 2024)
- c09db10: Sampling Performance Testing (#3584) (alexbarghi-nv, Jan 12, 2024)
- 24d02a5: Fix OOB error, BFS C API should validate that the source vertex is a… (ChuckHastings, Jan 12, 2024)
- 5d4ba38: MNMG ECG (#4030) (naimnv, Jan 12, 2024)
- aa66a32: Remove usages of rapids-env-update (#4090) (KyleFromNVIDIA, Jan 12, 2024)
- 4748ca1: nx-cugraph: PLC now handles isolated nodes; clean up our workarounds… (eriknw, Jan 16, 2024)
- 8672534: nx-cugraph: add weakly connected components (#4071) (eriknw, Jan 17, 2024)
- eacdf58: Provide explicit pool sizes and avoid RMM detail APIs (#4086) (harrism, Jan 17, 2024)
- c5d2a9a: `nx-cugraph`: add `to_undirected` method; add reciprocity algorithms… (eriknw, Jan 18, 2024)
- 5e5094a: fix merge conflicts (alexbarghi-nv, Jan 18, 2024)
40 changes: 40 additions & 0 deletions .github/workflows/build.yaml
@@ -133,3 +133,43 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: nx-cugraph
+  wheel-build-cugraph-dgl:
+    needs: wheel-publish-cugraph
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_cugraph-dgl.sh
+  wheel-publish-cugraph-dgl:
+    needs: wheel-build-cugraph-dgl
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: cugraph-dgl
+  wheel-build-cugraph-pyg:
+    needs: wheel-publish-cugraph
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_cugraph-pyg.sh
+  wheel-publish-cugraph-pyg:
+    needs: wheel-build-cugraph-pyg
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: cugraph-pyg
34 changes: 34 additions & 0 deletions .github/workflows/pr.yaml
@@ -25,6 +25,10 @@ jobs:
       - wheel-tests-cugraph
       - wheel-build-nx-cugraph
       - wheel-tests-nx-cugraph
+      - wheel-build-cugraph-dgl
+      - wheel-tests-cugraph-dgl
+      - wheel-build-cugraph-pyg
+      - wheel-tests-cugraph-pyg
       - devcontainer
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
@@ -127,6 +131,36 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_nx-cugraph.sh
+  wheel-build-cugraph-dgl:
+    needs: wheel-tests-cugraph
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: pull-request
+      script: ci/build_wheel_cugraph-dgl.sh
+  wheel-tests-cugraph-dgl:
+    needs: wheel-build-cugraph-dgl
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: pull-request
+      script: ci/test_wheel_cugraph-dgl.sh
+      matrix_filter: map(select(.ARCH == "amd64"))
+  wheel-build-cugraph-pyg:
+    needs: wheel-tests-cugraph
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: pull-request
+      script: ci/build_wheel_cugraph-pyg.sh
+  wheel-tests-cugraph-pyg:
+    needs: wheel-build-cugraph-pyg
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: pull-request
+      script: ci/test_wheel_cugraph-pyg.sh
+      matrix_filter: map(select(.ARCH == "amd64" and .CUDA_VER == "11.8.0"))
   devcontainer:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
18 changes: 18 additions & 0 deletions .github/workflows/test.yaml
@@ -57,3 +57,21 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       script: ci/test_wheel_nx-cugraph.sh
+  wheel-tests-cugraph-dgl:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      script: ci/test_wheel_cugraph-dgl.sh
+  wheel-tests-cugraph-pyg:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/[email protected]
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      script: ci/test_wheel_cugraph-pyg.sh
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -52,7 +52,7 @@ repos:
         pass_filenames: false
         additional_dependencies: [gitpython]
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.5.1
+    rev: v1.8.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
2 changes: 1 addition & 1 deletion benchmarks/cugraph/standalone/bulk_sampling/README.md
@@ -143,7 +143,7 @@ You will need to modify the bash scripts to run appropriately for your environment and
 desired training workflow. The standard sbatch arguments are at the top of the script, such as
 job name, queue, etc. These will need to be modified for your SLURM cluster.
 
-Next are arguments for the container image (which is currently set to the current DLFW image),
+Next are arguments for the container image (required),
 and directories where the data and outputs are stored. The directories default to subdirectories
 of the current working directory. But if there is a high-throughput storage system available,
 using that storage for the samples and datasets is highly recommended.
Changes to an additional file (filename not captured):

@@ -16,7 +16,7 @@
 os.environ["RAPIDS_NO_INITIALIZE"] = "1"
 os.environ["CUDF_SPILL"] = "1"
 os.environ["LIBCUDF_CUFILE_POLICY"] = "KVIKIO"
-os.environ["KVIKIO_NTHREADS"] = "64"
+os.environ["KVIKIO_NTHREADS"] = "8"
 
 import argparse
 import json
@@ -123,6 +123,13 @@ def parse_args():
         required=True,
     )
 
+    parser.add_argument(
+        "--use_wholegraph",
+        action="store_true",
+        help="Whether to use WholeGraph feature storage",
+        required=False,
+    )
+
     parser.add_argument(
         "--model",
         type=str,
@@ -162,6 +169,13 @@ def parse_args():
         required=False,
     )
 
+    parser.add_argument(
+        "--skip_download",
+        action="store_true",
+        help="Whether to skip downloading",
+        required=False,
+    )
+
     return parser.parse_args()
 
 
@@ -186,16 +200,37 @@ def main(args):

     world_size = int(os.environ["SLURM_JOB_NUM_NODES"]) * args.gpus_per_node
 
+    if args.use_wholegraph:
+        # TODO support DGL too
+        # TODO support WG without cuGraph
+        if args.framework not in ["cuGraphPyG"]:
+            raise ValueError("WG feature store only supported with cuGraph backends")
+        from pylibwholegraph.torch.initialize import (
+            get_global_communicator,
+            get_local_node_communicator,
+        )
+
+        logger.info("initializing WG comms...")
+        wm_comm = get_global_communicator()
+        get_local_node_communicator()
+
+        wm_comm = wm_comm.wmb_comm
+        logger.info(f"rank {global_rank} successfully initialized WG comms")
+        wm_comm.barrier()
+
     dataset = OGBNPapers100MDataset(
         replication_factor=args.replication_factor,
         dataset_dir=args.dataset_dir,
         train_split=args.train_split,
         val_split=args.val_split,
         load_edge_index=(args.framework == "PyG"),
+        backend="wholegraph" if args.use_wholegraph else "torch",
     )
 
-    if global_rank == 0:
+    # Note: this does not generate WG files
+    if global_rank == 0 and not args.skip_download:
         dataset.download()
 
     dist.barrier()
 
     fanout = [int(f) for f in args.fanout.split("_")]
@@ -234,6 +269,7 @@ def main(args):
             replace=False,
             num_neighbors=fanout,
             batch_size=args.batch_size,
+            backend="wholegraph" if args.use_wholegraph else "torch",
         )
     else:
         raise ValueError("unsupported framework")
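For reviewers unfamiliar with pylibwholegraph, here is a minimal sketch of the communicator bootstrap the trainer now performs, pulled out of main() for clarity. It is a sketch under two assumptions: torch.distributed is already initialized (the trainer does this elsewhere), and init_wholegraph_comms is a hypothetical helper name; only calls that appear in the diff above are used.

# Sketch only: the WholeGraph communicator bootstrap mirrored from the
# trainer diff above. Assumes torch.distributed.init_process_group() has
# already been called; init_wholegraph_comms is a hypothetical helper.
from pylibwholegraph.torch.initialize import (
    get_global_communicator,
    get_local_node_communicator,
)


def init_wholegraph_comms():
    # The global communicator spans every rank in the job. The local-node
    # communicator is created for its side effect (intra-node setup), so
    # its handle is deliberately not kept, matching the trainer code.
    wm_comm = get_global_communicator()
    get_local_node_communicator()

    # Drop to the underlying wmb communicator and synchronize so that no
    # rank proceeds before all ranks have finished initialization.
    wm_comm = wm_comm.wmb_comm
    wm_comm.barrier()
    return wm_comm

Whether creating the local-node communicator purely for its side effect is required before building embeddings is worth confirming during review.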
Changes to an additional file (filename not captured):

@@ -24,6 +24,10 @@
 import os
 import json
 
+from cugraph.utilities.utils import import_optional
+
+wgth = import_optional("pylibwholegraph.torch")
+
 
 class OGBNPapers100MDataset(Dataset):
     def __init__(
@@ -34,6 +38,7 @@ def __init__(
         train_split=0.8,
         val_split=0.5,
         load_edge_index=True,
+        backend="torch",
     ):
         self.__replication_factor = replication_factor
         self.__disk_x = None
@@ -43,6 +48,7 @@ def __init__(
         self.__train_split = train_split
         self.__val_split = val_split
         self.__load_edge_index = load_edge_index
+        self.__backend = backend
 
     def download(self):
         import logging
@@ -152,6 +158,27 @@ def download(self):
             )
             ldf.to_parquet(node_label_file_path)
 
+        # WholeGraph
+        wg_bin_file_path = os.path.join(dataset_path, "wgb", "paper")
+        if self.__replication_factor == 1:
+            wg_bin_rep_path = os.path.join(wg_bin_file_path, "node_feat.d")
+        else:
+            wg_bin_rep_path = os.path.join(
+                wg_bin_file_path, f"node_feat_{self.__replication_factor}x.d"
+            )
+
+        if not os.path.exists(wg_bin_rep_path):
+            os.makedirs(wg_bin_rep_path)
+            if dataset is None:
+                from ogb.nodeproppred import NodePropPredDataset
+
+                dataset = NodePropPredDataset(
+                    name="ogbn-papers100M", root=self.__dataset_dir
+                )
+            node_feat = dataset[0][0]["node_feat"]
+            for k in range(self.__replication_factor):
+                node_feat.tofile(os.path.join(wg_bin_rep_path, f"{k:04d}.bin"))
+
     @property
     def edge_index_dict(
         self,
@@ -224,21 +251,52 @@ def edge_index_dict(

     @property
     def x_dict(self) -> Dict[str, torch.Tensor]:
+        if self.__disk_x is None:
+            if self.__backend == "wholegraph":
+                self.__load_x_wg()
+            else:
+                self.__load_x_torch()
+
+        return self.__disk_x
+
+    def __load_x_torch(self) -> None:
         node_type_path = os.path.join(
             self.__dataset_dir, "ogbn_papers100M", "npy", "paper"
         )
+        if self.__replication_factor == 1:
+            full_path = os.path.join(node_type_path, "node_feat.npy")
+        else:
+            full_path = os.path.join(
+                node_type_path, f"node_feat_{self.__replication_factor}x.npy"
+            )
 
-        if self.__disk_x is None:
-            if self.__replication_factor == 1:
-                full_path = os.path.join(node_type_path, "node_feat.npy")
-            else:
-                full_path = os.path.join(
-                    node_type_path, f"node_feat_{self.__replication_factor}x.npy"
-                )
+        self.__disk_x = {"paper": torch.as_tensor(np.load(full_path, mmap_mode="r"))}
 
-            self.__disk_x = {"paper": np.load(full_path, mmap_mode="r")}
+    def __load_x_wg(self) -> None:
+        node_type_path = os.path.join(
+            self.__dataset_dir, "ogbn_papers100M", "wgb", "paper"
+        )
+        if self.__replication_factor == 1:
+            full_path = os.path.join(node_type_path, "node_feat.d")
+        else:
+            full_path = os.path.join(
+                node_type_path, f"node_feat_{self.__replication_factor}x.d"
+            )
 
-        return self.__disk_x
+        file_list = [os.path.join(full_path, f) for f in os.listdir(full_path)]
+
+        x = wgth.create_embedding_from_filelist(
+            wgth.get_global_communicator(),
+            "chunked",  # TODO support other options
+            "cpu",  # TODO support GPU
+            file_list,
+            torch.float32,
+            128,
+        )
+
+        print("created x wg embedding", x)
+
+        self.__disk_x = {"paper": x}
 
     @property
     def y_dict(self) -> Dict[str, torch.Tensor]:
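The wgb format written by download() above is raw float32 rows emitted with numpy.tofile, one .bin shard per replication copy; __load_x_wg then maps the shard list into a chunked WholeGraph embedding. Below is a minimal sketch of that round trip. The /tmp/wgb_demo path and the random 1000x128 feature matrix are illustrative stand-ins for the real ogbn-papers100M data, and WholeGraph comms are assumed to be initialized already.

# Sketch only: write raw float32 shards the way download() does, then load
# them the way __load_x_wg does. Paths and sizes here are illustrative.
import os

import numpy as np
import torch
import pylibwholegraph.torch as wgth

shard_dir = "/tmp/wgb_demo/node_feat.d"  # hypothetical shard directory
os.makedirs(shard_dir, exist_ok=True)

# 1000 nodes x 128 features; 128 matches the width hard-coded in the PR.
node_feat = np.random.rand(1000, 128).astype("float32")
node_feat.tofile(os.path.join(shard_dir, "0000.bin"))  # raw rows, no header

# Sorted for determinism; note the PR reads os.listdir() order directly.
file_list = sorted(os.path.join(shard_dir, f) for f in os.listdir(shard_dir))

x = wgth.create_embedding_from_filelist(
    wgth.get_global_communicator(),  # requires WG comms initialized
    "chunked",  # memory layout; other options are a TODO in the PR
    "cpu",  # host memory placement; GPU placement is a TODO in the PR
    file_list,
    torch.float32,  # dtype of the raw shards
    128,  # feature dimension (row width of each shard)
)

One point worth raising in review: because __load_x_wg builds file_list from os.listdir() without sorting, multi-shard replicated layouts could be assembled in filesystem order rather than shard order.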
13 changes: 0 additions & 13 deletions benchmarks/cugraph/standalone/bulk_sampling/run_sampling.sh
@@ -36,8 +36,6 @@ export CUDF_SPILL=1
 export LIBCUDF_CUFILE_POLICY="OFF"
 export GPUS_PER_NODE=8
 
-PATCH_CUGRAPH=1
-
 export SCHEDULER_FILE=$SCHEDULER_FILE
 export LOGS_DIR=$LOGS_DIR
 
@@ -60,17 +58,6 @@ else
     ${MG_UTILS_DIR}/run-dask-process.sh workers &
 fi
 
-if [[ $PATCH_CUGRAPH == 1 ]]; then
-    mkdir /opt/cugraph-patch
-    git clone https://github.com/alexbarghi-nv/cugraph -b dlfw-patch-24.01 /opt/cugraph-patch
-
-    rm /opt/rapids/cugraph/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
-    cp /opt/cugraph-patch/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py /opt/rapids/cugraph/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py
-    rm /usr/local/lib/python3.10/dist-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py
-    cp /opt/cugraph-patch/python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py /usr/local/lib/python3.10/dist-packages/cugraph/structure/graph_implementation/simpleDistributedGraph.py
-
-fi
-
 echo "properly waiting for workers to connect"
 NUM_GPUS=$(python -c "import os; print(int(os.environ['SLURM_JOB_NUM_NODES'])*int(os.environ['GPUS_PER_NODE']))")
 handleTimeout 120 python ${MG_UTILS_DIR}/wait_for_workers.py \
Changes to an additional file (filename not captured):

@@ -18,7 +18,7 @@
 #SBATCH -N 1
 #SBATCH -t 00:25:00
 
-CONTAINER_IMAGE="/lustre/fsw/rapids/abarghi/dlfw_patched.squash"
+CONTAINER_IMAGE=${CONTAINER_IMAGE:="please_specify_container"}
 SCRIPTS_DIR=$(pwd)
 LOGS_DIR=${LOGS_DIR:=$(pwd)"/logs"}
 SAMPLES_DIR=${SAMPLES_DIR:=$(pwd)/samples}