diff --git a/.travis.yml b/.travis.yml
index f4664f02..18e1e916 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,35 +1,90 @@
+language: shell  # generic image; Python is provisioned through conda in `install`
+
+os:
+  - linux
+  - osx
+  - windows
+
+env:
+  global:
+    - CUDA_HOME=/usr/local/cuda  # default; script/cuda.sh re-points it for CUDA jobs
+  jobs:  # IDX picks the CPU/CUDA variant (read by script/cuda.sh and script/rename_wheel.py)
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cpu
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu101
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cpu
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cpu
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
+    - TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
+
 jobs:
-  include:
-    - os: linux
-      language: python
-      python: 3.7
-      addons:
-        apt:
-          sources:
-            - ubuntu-toolchain-r-test
-          packages:
-            - gcc-5
-            - g++-5
-      env:
-        - CC=gcc-5
-        - CXX=g++-5
+  exclude:  # Exclude *all* macOS CUDA jobs and Windows CUDA 9.2/10.0 jobs.
+    - os: osx
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
+    - os: osx
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
+    - os: osx
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu101
+    - os: osx
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
+    - os: osx
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
+    - os: osx
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu101
+    - os: osx
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
+    - os: osx
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
+    - os: osx
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu101
+    - os: windows
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu92
+    - os: windows
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.8 IDX=cu100
+    - os: windows
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu92
+    - os: windows
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.7 IDX=cu100
+    - os: windows
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu92
+    - os: windows
+      env: TORCH_VERSION=1.4.0 PYTHON_VERSION=3.6 IDX=cu100
+
+
 install:
-  - pip install numpy
-  - pip install --pre torch torchvision -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
-  - pip install pycodestyle
-  - pip install flake8
-  - pip install codecov
-  - pip install sphinx
-  - pip install sphinx_rtd_theme
-  - pip install sphinx-autodoc-typehints
+  - source script/cuda.sh  # exports TOOLKIT and FORCE_CPU/FORCE_CUDA; installs the CUDA toolkit for CUDA jobs
+  - source script/conda.sh  # installs Miniconda, prepends it to PATH and creates the `test` env
+  - conda create --yes -n test python="${PYTHON_VERSION}"  # NOTE(review): script/conda.sh already runs this exact command
+  - source activate test
+  - conda install pytorch=${TORCH_VERSION} ${TOOLKIT} -c pytorch --yes  # TOOLKIT is `cpuonly`, `cudatoolkit=X.Y` or empty
+  - source script/torch.sh  # patches the installed torch headers on Windows CUDA jobs only
+  - pip install flake8 codecov
+  - python setup.py install
+
 script:
-  - python -c "import torch; print(torch.__version__)"
-  - pycodestyle .
   - flake8 .
-  - python setup.py install
   - python setup.py test
-  - cd docs && make clean && make html && make doctest && cd ..
 after_success:
+  - python setup.py bdist_wheel --dist-dir=dist/torch-${TORCH_VERSION}
+  - python script/rename_wheel.py ${IDX}
   - codecov
+deploy:  # publishes the wheels produced in `after_success` to S3
+  provider: s3
+  region: eu-central-1
+  edge: true
+  access_key_id: AKIAJB7S6NJ5OM5MAAGA
+  secret_access_key: ${S3_SECRET_ACCESS_KEY}  # expanded from the S3_SECRET_ACCESS_KEY environment variable
+  bucket: pytorch-scatter
+  local_dir: dist/torch-${TORCH_VERSION}  # same directory that `bdist_wheel --dist-dir` writes to
+  upload_dir: whl/torch-${TORCH_VERSION}
+  acl: public_read
+  on:  # deploy only for tagged builds of rusty1s/pytorch_scatter
+    repo: rusty1s/pytorch_scatter
+    tags: true
 notifications:
   email: false
diff --git a/MANIFEST.in b/MANIFEST.in
index 577bbb73..fe38bb7e 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,5 @@
+include README.md
 include LICENSE
 
+recursive-exclude test *
 recursive-include csrc *
diff --git a/README.md b/README.md
index a05ac34b..9ebdd9a1 100644
--- a/README.md
+++ b/README.md
@@ -36,7 +36,26 @@ In addition, we provide the following **composite functions** which make use of
 
 All included operations are broadcastable, work on varying data types, are implemented both for CPU and GPU with corresponding backward implementations, and are fully traceable.
 
-## Installation
+## Installation
+
+### Binaries
+
+We provide pip wheels for all major OS/PyTorch/CUDA combinations, see [here](http://pytorch-scatter.s3-website.eu-central-1.amazonaws.com/whl).
+To install from binaries, simply run
+
+```
+pip install torch-scatter==latest+${CUDA} -f http://pytorch-scatter.s3-website.eu-central-1.amazonaws.com/whl/torch-1.4.0.html --trusted-host pytorch-scatter.s3-website.eu-central-1.amazonaws.com
+```
+
+where `${CUDA}` should be replaced by either `cpu`, `cu92`, `cu100` or `cu101` depending on your PyTorch installation.
+
+|             | `cpu` | `cu92` | `cu100` | `cu101` |
+|-------------|-------|--------|---------|---------|
+| **Linux**   | ✅    | ✅     | ✅      | ✅      |
+| **Windows** | ✅    | ❌     | ❌      | ✅      |
+| **macOS**   | ✅    | ❌     | ❌      | ❌      |
+
+### From source
 
 Ensure that at least PyTorch 1.4.0 is installed and verify that `cuda/bin` and `cuda/include` are in your `$PATH` and `$CPATH` respectively, *e.g.*:
 
@@ -51,19 +70,24 @@ $ echo $CPATH
 >>> /usr/local/cuda/include:...
 ```
 
-When running in a docker container without nvidia driver, PyTorch needs to evaluate the compute capabilities and may fail. In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`
+Then run
 
 ```
-export TORCH_CUDA_ARCH_LIST = "6.0 6.1 7.2+PTX 7.5+PTX"
+pip install torch-scatter
 ```
 
-### Windows
+or
+
+```
+python setup.py install
+```
 
-If you are installing this on Windows specifically, **you will need to point the setup to your Visual Studio installation** for some neccessary libraries and header files.
-To do this, add the include and library paths of your installation to the path lists in setup.py as described in the respective comments in the code.
+When running in a docker container without nvidia driver, PyTorch needs to evaluate the compute capabilities and may fail.
+In this case, ensure that the compute capabilities are set via `TORCH_CUDA_ARCH_LIST`, *e.g.*:
 
-If you are running into any installation problems, please create an [issue](https://github.com/rusty1s/pytorch_scatter/issues).
-Be sure to import `torch` first before using this package to resolve symbols the dynamic linker must see.
+```
+export TORCH_CUDA_ARCH_LIST="6.0 6.1 7.2+PTX 7.5+PTX"
+```
 
 ## Example
 
diff --git a/csrc/scatter.cpp b/csrc/scatter.cpp
index d1fd1a0d..6c9fab3b 100644
--- a/csrc/scatter.cpp
+++ b/csrc/scatter.cpp
@@ -1,3 +1,4 @@
+#include <Python.h>
 #include <torch/script.h>
 
 #include "cpu/scatter_cpu.h"
@@ -7,6 +8,10 @@
 #include "cuda/scatter_cuda.h"
 #endif
 
+#ifdef _WIN32 // init-symbol stub; presumably needed to link on Windows (confirm)
+PyMODINIT_FUNC PyInit__scatter(void) { return NULL; }
+#endif // _WIN32
+
 torch::Tensor broadcast(torch::Tensor src, torch::Tensor other, int64_t dim) {
   if (src.dim() == 1)
     for (auto i = 0; i < dim; i++)
diff --git a/csrc/segment_coo.cpp b/csrc/segment_coo.cpp
index b534e110..955b5295 100644
--- a/csrc/segment_coo.cpp
+++ b/csrc/segment_coo.cpp
@@ -1,3 +1,4 @@
+#include <Python.h>
 #include <torch/script.h>
 
 #include "cpu/segment_coo_cpu.h"
@@ -7,6 +8,10 @@
 #include "cuda/segment_coo_cuda.h"
 #endif
 
+#ifdef _WIN32 // init-symbol stub; presumably needed to link on Windows (confirm)
+PyMODINIT_FUNC PyInit__segment_coo(void) { return NULL; }
+#endif // _WIN32
+
 std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
 segment_coo_fw(torch::Tensor src, torch::Tensor index,
                torch::optional<torch::Tensor> optional_out,
diff --git a/csrc/segment_csr.cpp b/csrc/segment_csr.cpp
index eddd2649..c8d4814a 100644
--- a/csrc/segment_csr.cpp
+++ b/csrc/segment_csr.cpp
@@ -1,3 +1,4 @@
+#include <Python.h>
 #include <torch/script.h>
 
 #include "cpu/segment_csr_cpu.h"
@@ -7,6 +8,10 @@
 #include "cuda/segment_csr_cuda.h"
 #endif
 
+#ifdef _WIN32 // init-symbol stub; presumably needed to link on Windows (confirm)
+PyMODINIT_FUNC PyInit__segment_csr(void) { return NULL; }
+#endif // _WIN32
+
 std::tuple<torch::Tensor, torch::optional<torch::Tensor>>
 segment_csr_fw(torch::Tensor src, torch::Tensor indptr,
                torch::optional<torch::Tensor> optional_out,
diff --git a/csrc/version.cpp b/csrc/version.cpp
new file mode 100644
index 00000000..0bc44861
--- /dev/null
+++ b/csrc/version.cpp
@@ -0,0 +1,21 @@
+#include <Python.h>
+#include <torch/script.h>
+
+#ifdef WITH_CUDA
+#include <cuda.h> // defines CUDA_VERSION
+#endif
+
+#ifdef _WIN32 // init-symbol stub; presumably needed to link on Windows (confirm)
+PyMODINIT_FUNC PyInit__version(void) { return NULL; }
+#endif
+// CUDA_VERSION this library was compiled with, or -1 for builds without CUDA.
+int64_t cuda_version() {
+#ifdef WITH_CUDA
+  return CUDA_VERSION;
+#else
+  return -1;
+#endif
+}
+// Registers the function as the operator torch_scatter::cuda_version.
+static auto registry =
+    torch::RegisterOperators().op("torch_scatter::cuda_version", &cuda_version);
diff --git a/script/.gitignore b/script/.gitignore
new file mode 100644
index 00000000..2d19fc76
--- /dev/null
+++ b/script/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/script/conda.sh b/script/conda.sh
new file mode 100755
index 00000000..cb8326f9
--- /dev/null
+++ b/script/conda.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Sourced by .travis.yml: installs Miniconda for ${TRAVIS_OS_NAME}, prepends it to PATH and creates the env "test".
+if [ "${TRAVIS_OS_NAME}" = "linux" ]; then
+  wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+  chmod +x miniconda.sh
+  ./miniconda.sh -b  # -b: batch (non-interactive) install
+  PATH=/home/travis/miniconda3/bin:${PATH}
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "osx" ]; then
+  wget -nv https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh
+  chmod +x miniconda.sh
+  ./miniconda.sh -b  # -b: batch (non-interactive) install
+  PATH=/Users/travis/miniconda3/bin:${PATH}
+fi
+
+
+if [ "${TRAVIS_OS_NAME}" = "windows" ]; then
+  choco install openssl.light
+  choco install miniconda3
+  PATH=/c/tools/miniconda3/Scripts:$PATH  # same install root that script/torch.sh hard-codes
+fi
+
+conda update --yes conda
+
+conda create --yes -n test python="${PYTHON_VERSION}"  # NOTE(review): .travis.yml repeats this command right after sourcing this script
diff --git a/script/cuda.sh b/script/cuda.sh
new file mode 100755
index 00000000..b4853dd8
--- /dev/null
+++ b/script/cuda.sh
@@ -0,0 +1,94 @@
+#!/bin/bash
+# Sourced by .travis.yml: selects per-OS/${IDX} CUDA settings, then installs the toolkit for CUDA jobs.
+if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cpu" ]; then
+  export TOOLKIT=cpuonly  # TOOLKIT is passed to `conda install pytorch=...` in .travis.yml
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu92" ]; then
+  export CUDA_SHORT=9.2
+  export CUDA=9.2.148-1
+  export UBUNTU_VERSION=ubuntu1604
+  export CUBLAS=cuda-cublas-dev-9-2
+  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu100" ]; then
+  export CUDA_SHORT=10.0
+  export CUDA=10.0.130-1
+  export UBUNTU_VERSION=ubuntu1804
+  export CUBLAS=cuda-cublas-dev-10-0
+  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "$IDX" = "cu101" ]; then
+  export IDX=cu101  # redundant: this branch only runs when IDX is already cu101
+  export CUDA_SHORT=10.1
+  export CUDA=10.1.105-1
+  export UBUNTU_VERSION=ubuntu1804
+  export CUBLAS=libcublas-dev  # package name differs from the 9.2/10.0 branches above
+  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cpu" ]; then
+  export TOOLKIT=cpuonly
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu92" ]; then  # combination excluded in .travis.yml
+  export CUDA_SHORT=9.2
+  export CUDA_URL=https://developer.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod2/local_installers2
+  export CUDA_FILE=cuda_${CUDA_SHORT}.148_win10  # NOTE(review): no .exe suffix, unlike cu101 — confirm Start-Process accepts it
+  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu100" ]; then  # combination excluded in .travis.yml
+  export CUDA_SHORT=10.0
+  export CUDA_URL=https://developer.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers
+  export CUDA_FILE=cuda_${CUDA_SHORT}.130_411.31_win10  # NOTE(review): no .exe suffix, unlike cu101 — confirm Start-Process accepts it
+  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "$IDX" = "cu101" ]; then
+  export CUDA_SHORT=10.1
+  export CUDA_URL=https://developer.nvidia.com/compute/cuda/${CUDA_SHORT}/Prod/local_installers
+  export CUDA_FILE=cuda_${CUDA_SHORT}.105_418.96_win10.exe
+  export TOOLKIT="cudatoolkit=${CUDA_SHORT}"
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "osx" ] && [ "$IDX" = "cpu" ]; then
+  export TOOLKIT=""  # empty: no extra package is added to the conda install on macOS
+fi
+
+if [ "${IDX}" = "cpu" ]; then
+  export FORCE_CPU=1  # read by setup.py: forces WITH_CUDA = False
+else
+  export FORCE_CUDA=1  # read by setup.py: forces WITH_CUDA = True
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "linux" ] && [ "${IDX}" != "cpu" ]; then  # Linux CUDA jobs: install the toolkit from NVIDIA's apt repository
+  INSTALLER=cuda-repo-${UBUNTU_VERSION}_${CUDA}_amd64.deb
+  wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/${INSTALLER}"  # https: this package is installed as root below
+  sudo dpkg -i "${INSTALLER}"
+  wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub"
+  sudo apt-key add 7fa2af80.pub
+  sudo apt update -qq
+  sudo apt install -y "cuda-core-${CUDA_SHORT/./-}" "cuda-cudart-dev-${CUDA_SHORT/./-}" "${CUBLAS}" "cuda-cusparse-dev-${CUDA_SHORT/./-}"
+  sudo apt clean
+  export CUDA_HOME=/usr/local/cuda-${CUDA_SHORT}  # exported so later build/test steps see the versioned install dir
+  export LD_LIBRARY_PATH=${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}  # no trailing ':' (= current dir) when previously unset
+  export PATH=${CUDA_HOME}/bin:${PATH}
+  nvcc --version  # sanity check that the toolkit is on PATH
+fi
+
+if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${IDX}" != "cpu" ]; then  # Windows CUDA jobs: download and run the NVIDIA installer
+  wget -nv "${CUDA_URL}/${CUDA_FILE}"
+  PowerShell -Command "Start-Process -FilePath \"${CUDA_FILE}\" -ArgumentList \"-s nvcc_${CUDA_SHORT} cublas_dev_${CUDA_SHORT} cusparse_dev_${CUDA_SHORT}\" -Wait -NoNewWindow"
+  CUDA_HOME=/c/Program\ Files/NVIDIA\ GPU\ Computing\ Toolkit/CUDA/v${CUDA_SHORT}
+  PATH=${CUDA_HOME}/bin:$PATH
+  PATH=/c/Program\ Files\ \(x86\)/Microsoft\ Visual\ Studio/2017/BuildTools/MSBuild/15.0/Bin:$PATH  # MSBuild from the VS 2017 Build Tools
+  nvcc --version  # sanity check that the toolkit is on PATH
+fi
+
+# Fix Cuda9.2 on Windows: https://github.com/pytorch/pytorch/issues/6109
+if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${IDX}" = "cu92" ]; then
+  sed -i.bak -e '129,141d' "${CUDA_HOME}/include/crt/host_config.h"  # deletes lines 129-141 in place, keeping a .bak copy
+fi
diff --git a/script/rename_wheel.py b/script/rename_wheel.py
new file mode 100644
index 00000000..9dfabd04
--- /dev/null
+++ b/script/rename_wheel.py
@@ -0,0 +1,24 @@
+import sys
+import os
+import os.path as osp
+import glob
+import shutil
+
+idx = sys.argv[1]  # build variant appended to the wheel version, e.g. cu101
+assert idx in ['cpu', 'cu92', 'cu100', 'cu101']
+
+dist_dir = osp.join(osp.dirname(osp.abspath(__file__)), '..', 'dist')
+wheels = glob.glob(osp.join(dist_dir, '**', '*.whl'), recursive=True)
+
+for wheel in wheels:
+    folder, names = osp.dirname(wheel), osp.basename(wheel).split('-')
+    if names[-4].endswith('+' + idx):  # already tagged, e.g. on a re-run
+        continue
+
+    # Copy to an alias whose version field is "latest+<idx>".
+    name = '-'.join(names[:-4] + ['latest+' + idx] + names[-3:])
+    shutil.copyfile(wheel, osp.join(folder, name))
+
+    # Rename the wheel itself so its version field is "<version>+<idx>".
+    name = '-'.join(names[:-4] + [names[-4] + '+' + idx] + names[-3:])
+    os.rename(wheel, osp.join(folder, name))
diff --git a/script/torch.sh b/script/torch.sh
new file mode 100755
index 00000000..fe9ebfc7
--- /dev/null
+++ b/script/torch.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Sourced by .travis.yml after PyTorch is installed into the conda env "test"; edits its headers in place (keeping .bak copies).
+# Fix "member may not be initialized" error on Windows: https://github.com/pytorch/pytorch/issues/27958
+if [ "${TRAVIS_OS_NAME}" = "windows" ] && [ "${IDX}" != "cpu" ]; then
+  sed -i.bak -e 's/constexpr/const/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/script/module.h
+  sed -i.bak -e 's/constexpr/const/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/torch/csrc/jit/argument_spec.h
+  sed -i.bak -e 's/return \*(this->value)/return \*((type\*)this->value)/g' /c/tools/miniconda3/envs/test/lib/site-packages/torch/include/pybind11/cast.h
+fi
+
diff --git a/script/wheel.py b/script/wheel.py
new file mode 100644
index 00000000..c8a702eb
--- /dev/null
+++ b/script/wheel.py
@@ -0,0 +1,46 @@
+import boto3
+
+s3_resource = boto3.resource('s3')
+bucket = s3_resource.Bucket(name="pytorch-scatter")
+objects = bucket.objects.all()
+wheels = sorted(obj.key for obj in objects if obj.key.endswith('.whl'))
+# Group wheel keys by their second path component (assumed: torch version).
+wheels_dict = {}
+for torch_version in sorted(set(wheel.split('/')[1] for wheel in wheels)):
+    wheels_dict[torch_version] = []
+
+for wheel in wheels:
+    torch_version = wheel.split('/')[1]
+    wheels_dict[torch_version].append(wheel)
+
+html = '<!DOCTYPE html>\n<html>\n<body>\n{}\n</body>\n</html>'
+href = '<a href="{}">{}</a><br/>'
+# Top-level index: one link per version page, in sorted (stable) order.
+url = 'http://pytorch-scatter.s3-website.eu-central-1.amazonaws.com/{}.html'
+index_html = html.format('\n'.join([
+    href.format(url.format('whl/' + key), key) for key in wheels_dict
+]))
+
+with open('index.html', 'w') as f:
+    f.write(index_html)
+
+bucket.Object('whl/index.html').upload_file(
+    Filename='index.html', ExtraArgs={
+        'ContentType': 'text/html',
+        'ACL': 'public-read'
+    })
+# Per-version pages: one link per wheel, pointing at the S3 object itself.
+url = 'https://pytorch-scatter.s3.eu-central-1.amazonaws.com/{}'
+for key, item in wheels_dict.items():
+    version_html = html.format('\n'.join([
+        href.format(url.format(i), '/'.join(i.split('/')[2:])) for i in item
+    ]))
+
+    with open('{}.html'.format(key), 'w') as f:
+        f.write(version_html)
+
+    bucket.Object('whl/{}.html'.format(key)).upload_file(
+        Filename='{}.html'.format(key), ExtraArgs={
+            'ContentType': 'text/html',
+            'ACL': 'public-read'
+        })
diff --git a/setup.py b/setup.py
index 3ab8cdf6..f91b520a 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,5 @@
 import os
 import os.path as osp
-import sys
 import glob
 from setuptools import setup, find_packages
 
@@ -11,7 +10,7 @@
 WITH_CUDA = torch.cuda.is_available() and CUDA_HOME is not None
 if os.getenv('FORCE_CUDA', '0') == '1':
     WITH_CUDA = True
-if os.getenv('FORCE_NON_CUDA', '0') == '1':
+if os.getenv('FORCE_CPU', '0') == '1':
     WITH_CUDA = False
 
 BUILD_DOCS = os.getenv('BUILD_DOCS', '0') == '1'
@@ -20,11 +19,7 @@
 def get_extensions():
     Extension = CppExtension
     define_macros = []
-    extra_compile_args = {'cxx': [], 'nvcc': []}
-
-    # Windows users: Edit both of these to contain your VS include path, i.e.:
-    # extra_compile_args['cxx'] += ['-I{VISUAL_STUDIO_DIR}\\include']
-    # extra_compile_args['nvcc'] += ['-I{VISUAL_STUDIO_DIR}\\include']
+    extra_compile_args = {'cxx': []}
 
     if WITH_CUDA:
         Extension = CUDAExtension
@@ -32,11 +27,7 @@ def get_extensions():
         nvcc_flags = os.getenv('NVCC_FLAGS', '')
         nvcc_flags = [] if nvcc_flags == '' else nvcc_flags.split(' ')
         nvcc_flags += ['-arch=sm_35', '--expt-relaxed-constexpr']
-        extra_compile_args['cxx'] += ['-O0']
-        extra_compile_args['nvcc'] += nvcc_flags
-
-    if sys.platform == 'win32':
-        extra_compile_args['cxx'] += ['/MP']
+        extra_compile_args['nvcc'] = nvcc_flags
 
     extensions_dir = osp.join(osp.dirname(osp.abspath(__file__)), 'csrc')
     main_files = glob.glob(osp.join(extensions_dir, '*.cpp'))
@@ -44,12 +35,18 @@ def get_extensions():
     for main in main_files:
         name = main.split(os.sep)[-1][:-4]
 
-        sources = [main, osp.join(extensions_dir, 'cpu', f'{name}_cpu.cpp')]
-        if WITH_CUDA:
-            sources += [osp.join(extensions_dir, 'cuda', f'{name}_cuda.cu')]
+        sources = [main]
+
+        path = osp.join(extensions_dir, 'cpu', f'{name}_cpu.cpp')
+        if osp.exists(path):
+            sources += [path]
+
+        path = osp.join(extensions_dir, 'cuda', f'{name}_cuda.cu')
+        if WITH_CUDA and osp.exists(path):
+            sources += [path]
 
         extension = Extension(
-            f'torch_scatter._{name}',
+            'torch_scatter._' + name,
             sources,
             include_dirs=[extensions_dir],
             define_macros=define_macros,
@@ -66,13 +63,14 @@ def get_extensions():
 
 setup(
     name='torch_scatter',
-    version='2.0.2',
+    version='2.0.3',
     author='Matthias Fey',
     author_email='matthias.fey@tu-dortmund.de',
     url='https://github.com/rusty1s/pytorch_scatter',
     description='PyTorch Extension Library of Optimized Scatter Operations',
     keywords=['pytorch', 'scatter', 'segment', 'gather'],
     license='MIT',
+    python_requires='>=3.6',
     install_requires=install_requires,
     setup_requires=setup_requires,
     tests_require=tests_require,
diff --git a/test/test_scatter.py b/test/test_scatter.py
index 8e43844b..edec96ac 100644
--- a/test/test_scatter.py
+++ b/test/test_scatter.py
@@ -91,10 +91,10 @@ def test_forward(test, reduce, dtype, device):
     dim = test['dim']
     expected = tensor(test[reduce], dtype, device)
 
-    out = getattr(torch_scatter, f'scatter_{reduce}')(src, index, dim)
+    out = getattr(torch_scatter, 'scatter_' + reduce)(src, index, dim)
     if isinstance(out, tuple):
         out, arg_out = out
-        arg_expected = tensor(test[f'arg_{reduce}'], torch.long, device)
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
         assert torch.all(arg_out == arg_expected)
     assert torch.all(out == expected)
 
@@ -121,7 +121,7 @@ def test_out(test, reduce, dtype, device):
 
     out = torch.full_like(expected, -2)
 
-    getattr(torch_scatter, f'scatter_{reduce}')(src, index, dim, out)
+    getattr(torch_scatter, 'scatter_' + reduce)(src, index, dim, out)
 
     if reduce == 'sum' or reduce == 'add':
         expected = expected - 2
@@ -150,9 +150,9 @@ def test_non_contiguous(test, reduce, dtype, device):
     if index.dim() > 1:
         index = index.transpose(0, 1).contiguous().transpose(0, 1)
 
-    out = getattr(torch_scatter, f'scatter_{reduce}')(src, index, dim)
+    out = getattr(torch_scatter, 'scatter_' + reduce)(src, index, dim)
     if isinstance(out, tuple):
         out, arg_out = out
-        arg_expected = tensor(test[f'arg_{reduce}'], torch.long, device)
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
         assert torch.all(arg_out == arg_expected)
     assert torch.all(out == expected)
diff --git a/test/test_segment.py b/test/test_segment.py
index 7b6bb39e..a5c28785 100644
--- a/test/test_segment.py
+++ b/test/test_segment.py
@@ -91,17 +91,17 @@ def test_forward(test, reduce, dtype, device):
     indptr = tensor(test['indptr'], torch.long, device)
     expected = tensor(test[reduce], dtype, device)
 
-    out = getattr(torch_scatter, f'segment_{reduce}_csr')(src, indptr)
+    out = getattr(torch_scatter, 'segment_' + reduce + '_csr')(src, indptr)
     if isinstance(out, tuple):
         out, arg_out = out
-        arg_expected = tensor(test[f'arg_{reduce}'], torch.long, device)
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
         assert torch.all(arg_out == arg_expected)
     assert torch.all(out == expected)
 
-    out = getattr(torch_scatter, f'segment_{reduce}_coo')(src, index)
+    out = getattr(torch_scatter, 'segment_' + reduce + '_coo')(src, index)
     if isinstance(out, tuple):
         out, arg_out = out
-        arg_expected = tensor(test[f'arg_{reduce}'], torch.long, device)
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
         assert torch.all(arg_out == arg_expected)
     assert torch.all(out == expected)
 
@@ -129,12 +129,12 @@ def test_out(test, reduce, dtype, device):
 
     out = torch.full_like(expected, -2)
 
-    getattr(torch_scatter, f'segment_{reduce}_csr')(src, indptr, out)
+    getattr(torch_scatter, 'segment_' + reduce + '_csr')(src, indptr, out)
     assert torch.all(out == expected)
 
     out.fill_(-2)
 
-    getattr(torch_scatter, f'segment_{reduce}_coo')(src, index, out)
+    getattr(torch_scatter, 'segment_' + reduce + '_coo')(src, index, out)
 
     if reduce == 'sum' or reduce == 'add':
         expected = expected - 2
@@ -165,16 +165,16 @@ def test_non_contiguous(test, reduce, dtype, device):
     if indptr.dim() > 1:
         indptr = indptr.transpose(0, 1).contiguous().transpose(0, 1)
 
-    out = getattr(torch_scatter, f'segment_{reduce}_csr')(src, indptr)
+    out = getattr(torch_scatter, 'segment_' + reduce + '_csr')(src, indptr)
     if isinstance(out, tuple):
         out, arg_out = out
-        arg_expected = tensor(test[f'arg_{reduce}'], torch.long, device)
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
         assert torch.all(arg_out == arg_expected)
     assert torch.all(out == expected)
 
-    out = getattr(torch_scatter, f'segment_{reduce}_coo')(src, index)
+    out = getattr(torch_scatter, 'segment_' + reduce + '_coo')(src, index)
     if isinstance(out, tuple):
         out, arg_out = out
-        arg_expected = tensor(test[f'arg_{reduce}'], torch.long, device)
+        arg_expected = tensor(test['arg_' + reduce], torch.long, device)
         assert torch.all(arg_out == arg_expected)
     assert torch.all(out == expected)
diff --git a/test/utils.py b/test/utils.py
index 1eb352b2..4decd25c 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -7,7 +7,7 @@
 
 devices = [torch.device('cpu')]
 if torch.cuda.is_available():
-    devices += [torch.device('cuda:{}'.format(torch.cuda.current_device()))]
+    devices += [torch.device(f'cuda:{torch.cuda.current_device()}')]
 
 
 def tensor(x, dtype, device):
diff --git a/torch_scatter/__init__.py b/torch_scatter/__init__.py
index 50e24ea8..8c6114f1 100644
--- a/torch_scatter/__init__.py
+++ b/torch_scatter/__init__.py
@@ -1,3 +1,26 @@
+# flake8: noqa
+
+import importlib.machinery
+import os.path as osp
+
+import torch
+
+__version__ = '2.0.3'
+expected_torch_version = (1, 4)
+
+try:  # loaded first so a PyTorch version mismatch gets a readable error
+    torch.ops.load_library(importlib.machinery.PathFinder().find_spec(
+        '_version', [osp.dirname(__file__)]).origin)
+except OSError as e:
+    if 'undefined symbol' in str(e):
+        major, minor = [int(x) for x in torch.__version__.split('.')[:2]]
+        t_major, t_minor = expected_torch_version
+        if (major, minor) != (t_major, t_minor):
+            raise RuntimeError(
+                f'Expected PyTorch version {t_major}.{t_minor} but found '
+                f'version {major}.{minor}.')
+    raise  # propagate the original OSError with its traceback intact
+
 from .scatter import (scatter_sum, scatter_add, scatter_mean, scatter_min,
                       scatter_max, scatter)
 from .segment_csr import (segment_sum_csr, segment_add_csr, segment_mean_csr,
@@ -9,7 +32,22 @@
 from .composite import (scatter_std, scatter_logsumexp, scatter_softmax,
                         scatter_log_softmax)
 
-__version__ = '2.0.2'
+# CUDA_VERSION is encoded as 1000 * major + 10 * minor (e.g. 10010 -> 10.1);
+# -1 means this build has no CUDA support, so there is nothing to compare.
+cuda_version = torch.ops.torch_scatter.cuda_version()
+if cuda_version != -1 and torch.version.cuda is not None:  # pragma: no cover
+    major, minor = cuda_version // 1000, cuda_version % 1000 // 10
+    # Compare major/minor only: torch may report a patch level ("10.1.243").
+    t_major, t_minor = [int(x) for x in torch.version.cuda.split('.')[:2]]
+    cuda_version = str(major) + '.' + str(minor)
+
+    if t_major != major or t_minor != minor:
+        raise RuntimeError(
+            f'Detected that PyTorch and torch_scatter were compiled with '
+            f'different CUDA versions. PyTorch has CUDA version '
+            f'{t_major}.{t_minor} and torch_scatter has CUDA version '
+            f'{major}.{minor}. Please reinstall the torch_scatter that '
+            f'matches your PyTorch install.')
 
 __all__ = [
     'scatter_sum',
diff --git a/torch_scatter/scatter.py b/torch_scatter/scatter.py
index 3213fbca..df84090d 100644
--- a/torch_scatter/scatter.py
+++ b/torch_scatter/scatter.py
@@ -1,4 +1,4 @@
-import warnings
+import importlib
 import os.path as osp
 from typing import Optional, Tuple
 
@@ -6,21 +6,8 @@
 
 from .utils import broadcast
 
-try:
-    torch.ops.load_library(
-        osp.join(osp.dirname(osp.abspath(__file__)), '_scatter.so'))
-except OSError:
-    warnings.warn('Failed to load `scatter` binaries.')
-
-    def scatter_with_arg_placeholder(src: torch.Tensor, index: torch.Tensor,
-                                     dim: int, out: Optional[torch.Tensor],
-                                     dim_size: Optional[int]
-                                     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        raise ImportError
-        return src, index
-
-    torch.ops.torch_scatter.scatter_min = scatter_with_arg_placeholder
-    torch.ops.torch_scatter.scatter_max = scatter_with_arg_placeholder
+torch.ops.load_library(importlib.machinery.PathFinder().find_spec(
+    '_scatter', [osp.dirname(__file__)]).origin)
 
 
 @torch.jit.script
diff --git a/torch_scatter/segment_coo.py b/torch_scatter/segment_coo.py
index e6ef936d..3fc33bbd 100644
--- a/torch_scatter/segment_coo.py
+++ b/torch_scatter/segment_coo.py
@@ -1,38 +1,11 @@
-import warnings
+import importlib
 import os.path as osp
 from typing import Optional, Tuple
 
 import torch
 
-try:
-    torch.ops.load_library(
-        osp.join(osp.dirname(osp.abspath(__file__)), '_segment_coo.so'))
-except OSError:
-    warnings.warn('Failed to load `segment_coo` binaries.')
-
-    def segment_coo_placeholder(src: torch.Tensor, index: torch.Tensor,
-                                out: Optional[torch.Tensor],
-                                dim_size: Optional[int]) -> torch.Tensor:
-        raise ImportError
-        return src
-
-    def segment_coo_with_arg_placeholder(
-            src: torch.Tensor, index: torch.Tensor,
-            out: Optional[torch.Tensor],
-            dim_size: Optional[int]) -> Tuple[torch.Tensor, torch.Tensor]:
-        raise ImportError
-        return src, index
-
-    def gather_coo_placeholder(src: torch.Tensor, index: torch.Tensor,
-                               out: Optional[torch.Tensor]) -> torch.Tensor:
-        raise ImportError
-        return src
-
-    torch.ops.torch_scatter.segment_sum_coo = segment_coo_placeholder
-    torch.ops.torch_scatter.segment_mean_coo = segment_coo_placeholder
-    torch.ops.torch_scatter.segment_min_coo = segment_coo_with_arg_placeholder
-    torch.ops.torch_scatter.segment_max_coo = segment_coo_with_arg_placeholder
-    torch.ops.torch_scatter.gather_coo = gather_coo_placeholder
+torch.ops.load_library(importlib.machinery.PathFinder().find_spec(
+    '_segment_coo', [osp.dirname(__file__)]).origin)
 
 
 @torch.jit.script
diff --git a/torch_scatter/segment_csr.py b/torch_scatter/segment_csr.py
index d7e5ad95..44aed216 100644
--- a/torch_scatter/segment_csr.py
+++ b/torch_scatter/segment_csr.py
@@ -1,36 +1,11 @@
-import warnings
+import importlib
 import os.path as osp
 from typing import Optional, Tuple
 
 import torch
 
-try:
-    torch.ops.load_library(
-        osp.join(osp.dirname(osp.abspath(__file__)), '_segment_csr.so'))
-except OSError:
-    warnings.warn('Failed to load `segment_csr` binaries.')
-
-    def segment_csr_placeholder(src: torch.Tensor, indptr: torch.Tensor,
-                                out: Optional[torch.Tensor]) -> torch.Tensor:
-        raise ImportError
-        return src
-
-    def segment_csr_with_arg_placeholder(
-            src: torch.Tensor, indptr: torch.Tensor,
-            out: Optional[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
-        raise ImportError
-        return src, indptr
-
-    def gather_csr_placeholder(src: torch.Tensor, indptr: torch.Tensor,
-                               out: Optional[torch.Tensor]) -> torch.Tensor:
-        raise ImportError
-        return src
-
-    torch.ops.torch_scatter.segment_sum_csr = segment_csr_placeholder
-    torch.ops.torch_scatter.segment_mean_csr = segment_csr_placeholder
-    torch.ops.torch_scatter.segment_min_csr = segment_csr_with_arg_placeholder
-    torch.ops.torch_scatter.segment_max_csr = segment_csr_with_arg_placeholder
-    torch.ops.torch_scatter.gather_csr = gather_csr_placeholder
+torch.ops.load_library(importlib.machinery.PathFinder().find_spec(
+    '_segment_csr', [osp.dirname(__file__)]).origin)
 
 
 @torch.jit.script