./julia-lab.sh
then, in terminal: cp .ssh/* ~/.ssh/
note: after installing the kernel, you need to copy the kernel over: cp -R /home/jhgilles/.local/share/jupyter/kernels/julia-1.1 /data/scratch/jhgilles/julia/share/jupyter/kernels/julia-1.1
104.197.28.58
In gcloud: new compute node, ubuntu 19.10
add Network Tags: jupyter, allow-jupyter, allow-tensorboard, allow-8889, allow-mosh, allow-dask
env TERM=xterm-256color ssh -i ~/.ssh/google_compute_engine [email protected]
# debian deps
sudo apt-get update && sudo apt-get upgrade -y
sudo apt-get install git vim wget tmux -y
# config files (on local)
# cd adv_lth
scp -i ~/.ssh/google_compute_engine -r cloud/supervisor/* cloud/supervisor/.* [email protected]:/home/radical/
# check for gpu
lspci | grep NVIDIA
# install drivers
# (maybe should use "headless"?)
sudo apt install nvidia-cuda-dev nvidia-headless-435 nvidia-utils-435 nvidia-profiler nvidia-cuda-toolkit
# reboot
sudo reboot now
# ... reconnect ...
# check install
nvidia-smi
nvcc --version
# try samples
# https://xcat-docs.readthedocs.io/en/stable/advanced/gpu/nvidia/verify_cuda_install.html
git clone --single-branch --depth 1 --branch 10.1 https://github.com/NVIDIA/cuda-samples cuda-samples
cd cuda-samples/Samples/deviceQuery
# replace /usr/local/cuda with /usr/ in Makefile, then:
make
./deviceQuery
cd ~
# the repo
git clone [email protected]:kazimuth/6.338 6.338
# anaconda
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
bash Miniconda3-latest-Linux-x86_64.sh
# our env
conda create -p ./julia
conda activate ./julia
# jupyterlab + cuda + julia
conda install -c conda-forge jupyterlab julia -y
# in julia:
julia
]
add CUDAdrv CUDAnative CuArrays IJulia Flux Plots
test CuArrays
# certificates for jupyterlab
openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout mykey.key -out mycert.pem
Note: jupyterlab configuration is in [adv_lth/]cloud/supervisor/.jupyter/jupyter_notebook_config.py see https://jupyter-notebook.readthedocs.io/en/stable/public_server.html#running-a-public-notebook-server
tmux # or tmux a
conda activate ./env
jupyter lab
open https://104.197.28.58:8888 note: jupyterlab password is “grumpiness ampersand”
. /data/scratch/jhgilles/miniconda3/bin/activate
conda activate /data/scratch/jhgilles/julia
export JUPYTER_CONFIG_DIR=/data/scratch/jhgilles/.jupyter
#export CLOUDSDK_CONFIG=/data/scratch/jhgilles/.gcloud
#export GOOGLE_APPLICATION_CREDENTIALS=/data/scratch/jhgilles/adv-lth/cloud/secrets/service-account-private-key.json
#export PYTHONPATH=/data/scratch/jhgilles/adv-lth/tinylth/gpu-src:/data/scratch/jhgilles/adv-lth/tinylth/lottery:$PYTHONPATH
export JULIA_DEPOT_PATH=/data/scratch/jhgilles/.julia
ignore the following
name: null channels: - conda-forge - defaults dependencies: - _libgcc_mutex=0.1=main - arpack=3.6.3=blas_openblash6d6ea35_1002 - attrs=19.3.0=py_0 - backcall=0.1.0=py_0 - blas=1.1=openblas - bleach=3.1.0=py_0 - ca-certificates=2019.11.28=hecc5488_0 - certifi=2019.11.28=py38_0 - cudatoolkit=10.1.243=h6bb024c_0 - curl=7.65.3=hf8cf82a_0 - decorator=4.4.1=py_0 - defusedxml=0.6.0=py_0 - entrypoints=0.3=py38_1000 - fftw=3.3.8=nompi_h7f3a6c3_1110 - gmp=6.1.2=hf484d3e_1000 - importlib_metadata=1.2.0=py38_0 - ipykernel=5.1.3=py38h5ca1d4c_0 - ipython=7.10.1=py38h5ca1d4c_0 - ipython_genutils=0.2.0=py_1 - jedi=0.15.1=py38_0 - jinja2=2.10.3=py_0 - json5=0.8.5=py_0 - jsonschema=3.2.0=py38_0 - julia=1.0.3=blas_openblash12d65f3_2 - jupyter_client=5.3.3=py38_1 - jupyter_core=4.6.1=py38_0 - jupyterlab=1.2.3=py_0 - jupyterlab_server=1.0.6=py_0 - krb5=1.16.3=h05b26f9_1001 - ld_impl_linux-64=2.33.1=h53a641e_7 - libblas=3.8.0=11_openblas - libcblas=3.8.0=11_openblas - libcurl=7.65.3=hda55be3_0 - libedit=3.1.20170329=hf8c457e_1001 - libffi=3.2.1=he1b5a44_1006 - libgcc-ng=9.2.0=hdf63c60_0 - libgfortran-ng=7.3.0=hdf63c60_2 - libgit2=0.28.3=h241e3f0_0 - liblapack=3.8.0=11_openblas - libopenblas=0.3.6=h5a2b251_2 - libsodium=1.0.17=h516909a_0 - libssh2=1.8.2=h22169c7_2 - libstdcxx-ng=9.2.0=hdf63c60_0 - markupsafe=1.1.1=py38h516909a_0 - metis=5.1.0=he1b5a44_1005 - mistune=0.8.4=py38h516909a_1000 - more-itertools=8.0.2=py_0 - mpfr=4.0.2=he80fd80_0 - nbconvert=5.6.1=py38_0 - nbformat=4.4.0=py_1 - ncurses=6.1=hf484d3e_1002 - notebook=6.0.1=py38_0 - openblas=0.3.3=h9ac9557_1001 - openlibm=0.5.4=h14c3975_1000 - openspecfun=0.5.3=hc99cbb1_1001 - openssl=1.1.1d=h516909a_0 - pandoc=2.8.1=0 - pandocfilters=1.4.2=py_1 - parso=0.5.1=py_0 - pcre2=10.23=2 - pexpect=4.7.0=py38_0 - pickleshare=0.7.5=py38_1000 - pip=19.3.1=py38_0 - prometheus_client=0.7.1=py_0 - prompt_toolkit=3.0.2=py_0 - ptyprocess=0.6.0=py_1001 - pygments=2.5.2=py_0 - pyrsistent=0.15.6=py38h516909a_0 - python=3.8.0=h357f687_5 - python-dateutil=2.8.1=py_0 - pyzmq=18.1.1=py38h1768529_0 - readline=8.0=hf8c457e_0 - send2trash=1.5.0=py_0 - setuptools=42.0.2=py38_0 - six=1.13.0=py38_0 - sqlite=3.30.1=hcee41ef_0 - suitesparse=4.5.6=blas_openblash17e8c26_1201 - tbb=2019.9=hc9558a2_1 - terminado=0.8.3=py38_0 - testpath=0.4.4=py_0 - tk=8.6.10=hed695b0_0 - tornado=6.0.3=py38h516909a_0 - traitlets=4.3.3=py38_0 - wcwidth=0.1.7=py_1 - webencodings=0.5.1=py_1 - wheel=0.33.6=py38_0 - xz=5.2.4=h14c3975_1001 - zeromq=4.3.2=he1b5a44_2 - zipp=0.6.0=py_0 - zlib=1.2.11=h516909a_1006
https://arxiv.org/abs/1711.11294
backprop sources: https://mitmath.github.io/18337/lecture10/estimation_identification https://mitmath.github.io/18337/lecture11/adjoints https://math.mit.edu/~stevenj/18.336/adjoint.pdf https://www.cs.toronto.edu/~rgrosse/courses/csc321_2018/slides/lec06.pdf
see also: https://github.com/layog/Accurate-Binary-Convolution-Network/blob/master/ABC.ipynb
$$\alpha_i, \Wa_i = \arg\min_{\alpha_i, \Wa_i} \norm{\Wv - \sum_i \alpha_i \Wa_i}^2$$ (1)
We could solve (1) numerically, e.g. by gradient descent, but then we wouldn't be able to backpropagate through to the real-valued weights.
Instead, assume:
Where
Now (1) becomes a linear regression problem (3). During training, we can fit
So:
(note:
$$\pderiv{l}{W} \;\steeq\; \pderiv{l}{F_{u_i}(\Wv)}$$ )
Then:
We can also approximate weights channel-wise: do the above operations for every output channel.
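A minimal sketch of that fit in Julia (my own code, not from the paper; fit_alphas, Wv, and B are made-up names): with the binarized bases as regressors, the α_i come from an ordinary least-squares solve.

# fit the scale factors α for one output channel by least squares
# Wv: real-valued weights for the channel, flattened to a vector
# B:  matrix whose columns are the binary bases (entries ±1)
fit_alphas(Wv::AbstractVector, B::AbstractMatrix) = B \ Wv   # solves min_α ‖Wv - B*α‖²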
Note that when
- other ways to select weights: to minimize output error, minimize outlier error, …
- smt solver?
- prune weights, then find mask? can you strip out the pruned weights tho? i don’t think so.
- is there some way to binarize to represent 0 more easily?
$$\mathrm{sign}(x) = \begin{cases} +1 & x \ge 0 \\ -1 & \mathrm{otherwise} \end{cases}$$
note: use batchnorm before activation to keep activations about where you’d want. then:
Both
max pooling on binary (±1) activations is basically useless, so do it before batchnorm + activation
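A rough Flux sketch of that ordering (my example, not project code; binsign is a made-up helper and the layer sizes are arbitrary):

using Flux

binsign(x) = ifelse(x >= 0, one(x), -one(x))   # ±1 binarization

block = Chain(
    Conv((3, 3), 16 => 32),   # conv on real-valued inputs
    MaxPool((2, 2)),          # pool before the values collapse to ±1
    BatchNorm(32),            # center/scale so the sign split is meaningful
    x -> binsign.(x),         # binarize last
)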
first, train normally. Then switch things
https://arxiv.org/pdf/1711.11294.pdf#subsection.3.3
"""Learned quantization operators inspired by https://github.com/Microsoft/LQ-Nets"""
import numpy as np
import tvm
from tvm.contrib.dlpack import to_pytorch_func
import warnings
import torch
import torch.nn
import torch.utils.dlpack
import itertools
import sys
from typing import List, Tuple
def levels(basis: np.ndarray):
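    # enumerate all 2^len(basis) sign patterns, sorted ascending by the value they decode to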
product_args = [[-1, +1] for e in basis]
levels = list(itertools.product(*product_args))
levels.sort(key=lambda l: l @ basis)
return [(l @ basis, l) for l in levels]
def splits(basis: np.ndarray):
# returns [thresh, bits]
# if prev_thresh(init -inf) < v < thresh, use val
# last thresh is +inf
levs = levels(basis)
res = []
for (l, bits), (lnext, _) in zip(levs, levs[1:]):
res.append(((l + lnext) / 2, bits))
res.append((np.infty, levs[-1][1]))
return res
def fsplits(basis: np.ndarray):
# returns [thresh, val]
# if prev_thresh < v < thresh, use val
# last thresh is +inf
res = []
s = splits(basis)
for (t, b) in s:
res.append((t, b @ basis))
return res
def quantize(arr: np.ndarray, basis: np.ndarray):
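    # map each element to the ±1 bit pattern of the level whose threshold interval contains it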
result = np.empty(arr.shape + (len(basis),), dtype='int8')
unset = np.ones(arr.shape, dtype='bool')
for thresh, bits in splits(basis):
to_set = (arr < thresh) & unset
unset[to_set] = False
result[to_set] = bits
return result
def fquantize(arr: np.ndarray, basis: np.ndarray) -> np.ndarray:
result = np.empty(arr.shape, dtype='float32')
unset = np.ones(arr.shape, dtype='bool')
for thresh, val in fsplits(basis):
to_set = (arr < thresh) & unset
unset[to_set] = False
result[to_set] = val
return result
def evaluate(bit_arr: np.ndarray, basis: np.ndarray):
return np.tensordot(bit_arr, basis, (1, 0))
def threshbits(basis):
s = splits(basis)
thresh = []
bits = []
for (t, bb) in s:
thresh.append(t)
bits.append(bb)
thresh.pop()
return np.array(thresh), np.array(bits)
def threshval(basis: np.ndarray):
fs: np.ndarray = np.array(fsplits(basis))
return (fs[:-1, 0], fs[:, 1])
class memodict(dict):
def __init__(self, fn):
self.fn = fn
self.warned = False
def __repr__(self):
return f'memodict({self.fn}, len={len(self)})'
def __missing__(self, k):
args, kwargs = k
ret = self.fn(*args, **dict(kwargs))
if len(self) >= 100 and not self.warned:
self.warned = True
warnings.warn(
'More than 100 entries in the memodict, consider .clear()ing it')
self[k] = ret
return ret
def __call__(self, *args, **kwargs):
key = (tuple(args), tuple(kwargs.items()))
return self[key]
def _tvm_quantize_inner(v, bits, Thresh, Value, cursor=None, depth=0):
'''The contents of the inner loop of a quantization operation.
Quantizes such that if Thresh[i-1] < v < Thresh[i], Value[i] is chosen.
Formulated so that it requires no instruction branches, only a series of compares:
Performs a binary search where each step either adds to or subtracts from the bounds.
v: the tvm.Expr to quantize
bits: the bitwidth to quantize to
Thresh: the tvm.placeholder to pull quantization thresholds from.
Value: the tvm.placeholder to pull quantized values from.
depth, lo, hi: search bounds
'''
width = int(2 ** bits)
# visualization:
# bits = 3, width = 8
#
#
# 0 1 2 3 4 5 6 7 <- Value
# 0 1 2 3 4 5 6 <- Thresh
# .|.|.|.|.|.|.|.
# ^ 0 <- depth
# ^ ^ 1
# ^ ^ ^ ^ 2
# . . . . . . . . 3
# note: both cases can both fire if width=2
if depth == 0:
# top
cursor = int(width // 2 - 1)
if depth == np.log2(width) - 1:
# base
return tvm.expr.Select(v < Thresh[cursor], Value(cursor), Value(cursor+1))
next_cursor = tvm.var(f'cursor{depth+1}', dtype='int32')
delta = int(width // 2**(depth + 2))
return tvm.expr.Let(
# let
next_cursor,
# =
tvm.expr.Select(v < Thresh[cursor], cursor - delta, cursor + delta),
# in:
_tvm_quantize_inner(v, bits, Thresh, Value, next_cursor, depth+1)
)
def _tvm_learned_quantize(bits,
n=tvm.var('n'),
c=tvm.var('c'),
y=tvm.var('y'),
x=tvm.var('x'),
target='llvm',
extrabit=False,
floating_point=False):
'''Build an operator to perform a learned-quantize operation.
floating_point: whether to output a single floating-point number or multiple +1/-1 floating bit numbers
extrabit: whether to add a final extra bit in floating_point=False mode that is always +1 (for constant offset)
'''
values = int(2 ** bits)
In = tvm.placeholder((n, c, y, x), name='In')
Threshes = tvm.placeholder((values-1,), name='Threshes')
if floating_point:
Vals = tvm.placeholder((values,), name='Vals')
Out = tvm.compute((n, c, y, x),
lambda nn, cc, yy, xx: _tvm_quantize_inner(
In(nn, cc, yy, xx), bits, Threshes, Vals
),
'Out')
elif extrabit:
Vals = tvm.placeholder((values, bits), name='Vals')
Out = tvm.compute((n, c, y, x, bits+1),
lambda nn, cc, yy, xx, bb: tvm.expr.Select(bb == bits,
tvm.const(+1, 'float32'),
_tvm_quantize_inner(
In(nn, cc, yy, xx), bits, Threshes, lambda i: Vals[i, bb],
),
),
'Out')
else: # normal n bits
Vals = tvm.placeholder((values, bits), name='Vals')
Out = tvm.compute((n, c, y, x, bits),
lambda nn, cc, yy, xx, bb: _tvm_quantize_inner(
In(nn, cc, yy, xx), bits, Threshes, lambda i: Vals[i, bb]
),
'Out')
s = tvm.create_schedule(Out.op)
if target == 'llvm':
# just fuse the outer loop; llvm will handle vectorization
nc = s[Out].fuse(Out.op.axis[0], Out.op.axis[1])
s[Out].parallel(nc)
elif target == 'opencl' or target == 'cuda':
# thread variables
block_x = tvm.thread_axis("blockIdx.x")
block_y = tvm.thread_axis("blockIdx.y")
block_z = tvm.thread_axis("blockIdx.z")
thread_x = tvm.thread_axis((0, 16), "threadIdx.x")
thread_y = tvm.thread_axis((0, 16), "threadIdx.y")
if floating_point:
nn, cc, yy, xx = Out.op.axis
else:
nn, cc, yy, xx, bb = Out.op.axis
nc = s[Out].fuse(nn, cc)
s[Out].bind(nc, block_z)
oy, iy = s[Out].split(yy, factor=16)
ox, ix = s[Out].split(xx, factor=16)
s[Out].bind(oy, block_y)
s[Out].bind(ox, block_x)
s[Out].bind(iy, thread_y)
s[Out].bind(ix, thread_x)
# TODO: cache thresholds? multiple ops per thread?
return tvm.build(s, [In, Threshes, Vals, Out],
name=f'quantize_bits_{target}_{n}_{c}_{y}_{x}',
target=target)
@memodict
def _torch_learned_quantize(bits, extrabit=False, floating_point=False, target='llvm'):
res = _tvm_learned_quantize(
bits, extrabit=extrabit, floating_point=floating_point, target=target)
def q(*args):
result = []
for i, arg in enumerate(args):
if isinstance(arg, torch.Tensor):
pak = torch.utils.dlpack.to_dlpack(arg)
#from . import inspect
# inspect.inspect(pak)
#print(f'arg {i} is tensor, dlpacking')
result.append(tvm.nd.from_dlpack(
torch.utils.dlpack.to_dlpack(arg)))
# result.append(tvm.nd.array(arg.detach().cpu().numpy().copy()))
# print(result[-1].asnumpy().ndim)
# print('done')
return res(*result)
#fn = to_pytorch_func(q)
return q
class LearnedQuantizeOpFloat(torch.autograd.Function):
@staticmethod
def forward(ctx, input, thresh, vals):
assert thresh.shape[0] == vals.shape[0] - \
1, 'mismatched thresh/val size'
bits = np.log2(vals.shape[0])
assert bits == int(bits), 'vals shape not power of 2'
target = 'cuda' if input.is_cuda else 'llvm'
output = input.new_empty(input.shape)
fn = _torch_learned_quantize(bits, floating_point=True, target=target)
fn(input, thresh, vals, output)
return output
@staticmethod
def backward(ctx, output_grad):
# "straight-through estimator"
return output_grad, None, None
# class LearnedQuantizeOpBits
def learned_quantize(input, basis, floating_point=True, extrabit=False):
'''Apply a learned quantization operation.
input: the tensor to quantize
basis: the basis to quantize against
floating_point: whether to return the quantization result as an individual floating point number,
or as multiple +/-1 float32s
'''
if floating_point:
thresh, val = threshval(basis.cpu().detach().numpy())
else:
thresh, val = threshbits(basis.cpu().detach().numpy())
thresh = torch.from_numpy(thresh.astype('float32'))
val = torch.from_numpy(val.astype('float32'))
if input.is_cuda:
thresh = thresh.cuda()
val = val.cuda()
if floating_point:
return LearnedQuantizeOpFloat.apply(input, thresh, val)
else:
# print(basis)
bits = basis.shape[0]
if extrabit:
bits += 1
target = 'cuda' if input.is_cuda else 'llvm'
fn = _torch_learned_quantize(
bits, floating_point=False, target=target, extrabit=extrabit)
#fn = _torch_learned_quantize(bits, floating_point=False, target=target)
output = input.new_empty(tuple(input.shape) + (bits,))
fn(input, thresh, val, output)
return output
def next_basis(x, basis, extrabit):
assert len(x.shape) == 4
assert len(basis.shape) == 1
bits = basis.shape[0]
# n c h w b
quantized = learned_quantize(
x, basis, floating_point=False, extrabit=extrabit)
# nchw b
quantized = quantized.view(-1, bits)
# nchw 1
targets = x.view(-1, 1)
# max(nchw, b) 1
# answer is first b bits
result, _ = torch.gels(targets, quantized)
result = result[:bits].view(-1)
return result
class Quantize(torch.nn.Module):
def __init__(self, bits, avg_factor=.9, init_type='asc', moving_center=False, extrabit=False):
super().__init__()
self.register_buffer('_bits', torch.Tensor([bits]))
self.register_buffer('_avg_factor', torch.Tensor([avg_factor]))
self.register_buffer('_moving_center', torch.Tensor([moving_center]))
self.register_buffer('_extrabit', torch.Tensor([extrabit]))
if init_type == 'asc':
self.register_buffer('basis',
torch.Tensor(list(range(1, self.bits+1))) / float(self.bits))
else:
            raise NotImplementedError(f'No such init_type: "{init_type}"')
def __repr__(self):
return f'Quantize(bits={self.bits}, basis={self.basis})'
    def cuda(self):
        super().cuda()
        # keep the scalar config buffers on the CPU (Tensor.cpu() is not in-place)
        self._bits = self._bits.cpu()
        self._avg_factor = self._avg_factor.cpu()
        self._moving_center = self._moving_center.cpu()
        self._extrabit = self._extrabit.cpu()
        return self
@property
def bits(self):
return int(self._bits[0])
    @property
    def avg_factor(self):
        return float(self._avg_factor[0])
@property
def moving_center(self):
return bool(self._moving_center[0])
    @property
    def extrabit(self):
        return bool(self._extrabit[0])
def forward(self, x):
if self.training:
newbasis = next_basis(x, self.basis, self.extrabit)
self.basis = self.basis * self.avg_factor + \
newbasis * (1.0-self.avg_factor)
return learned_quantize(x, self.basis)
https://docs.google.com/presentation/d/1l-BuAtyKgoVYakJSijaSqaTL3friESDyTOnU2OLqGoA/edit#slide=id.p
@cuprintf
@cuda threads=… f(args)
@device_code_typed @cuda f(args)
@device_code_llvm @cuda f(args)
@device_code_sass @cuda f(args)
@device_code_ptx @cuda f(args)
@device_code_{lowered,typed,warntype,llvm,ptx,sass}
julia> const x = CuArray{Float32}(undef, 1024)
julia> using BenchmarkTools
julia> @benchmark CuArrays.@sync(identity.($x))
BenchmarkTools.Trial:
  minimum time: 13.824 μs (0.00% GC)
  maximum time: 401.689 μs (0.00% GC)
julia> CuArrays.@time CuArrays.@sync identity.(x);
  0.000378 seconds (57 CPU allocations: 1.938 KiB) (1 GPU allocation: 4.000 KiB)
julia> using CUDAdrv
julia> CUDAdrv.@elapsed identity.(x)
5.888f-6
@benchmark CuArrays.@sync … CuArrays.@time CuArrays.@sync identity
can use map, reduce, broadcast on cuda
vendor libraries are transparently used (except CUFFT)
don’t iterate scalarly; avoid multiple kernels; bump the problem size; keep data on the GPU
strengths:
- composable
-
Flux provides a set of helpers for custom layers, which you can enable by calling
Flux.@functor Affine
This enables a useful extra set of functionality for our Affine layer, such as collecting its parameters or moving it to the GPU.
conv: the layer is defined in Flux, the conv op it calls lives in NNlib; Zygote defines the adjoint
note: technically not a convolution, it’s a correlation. >:/ https://fluxml.ai/Flux.jl/stable/models/layers/#Flux.Conv https://github.com/FluxML/Flux.jl/blob/f46b5243dbc762081ccfbe991690750b307e2fc2/src/layers/conv.jl#L5-L25 https://github.com/FluxML/NNlib.jl/blob/master/src/conv.jl
note: takes data in WHCN order (but Julia is column-major, so the memory layout matches row-major NCHW). need to define a conv with an alternate data order somehow. how does this plug into differentiation? (Zygote.jl; see the adjoints below)
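Quick check of the layout claim (plain Julia, nothing project-specific): a WHCN array stored column-major has W fastest-varying, i.e. the same memory order as row-major NCHW.

x = rand(Float32, 100, 100, 3, 1)   # W × H × C × N
strides(x)                          # (1, 100, 10000, 30000): W is contiguous, N slowest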
"""
Conv(size, in=>out)
Conv(size, in=>out, relu)
Standard convolutional layer. `size` should be a tuple like `(2, 2)`.
`in` and `out` specify the number of input and output channels respectively.
Example: Applying Conv layer to a 1-channel input using a 2x2 window size,
giving us a 16-channel output. Output is activated with ReLU.
size = (2,2)
in = 1
out = 16
Conv((2, 2), 1=>16, relu)
Data should be stored in WHCN order (width, height, # channels, # batches).
In other words, a 100×100 RGB image would be a `100×100×3×1` array,
and a batch of 50 would be a `100×100×3×50` array.
Takes the keyword arguments `pad`, `stride` and `dilation`.
"""
struct Conv{N,M,F,A,V}
σ::F
weight::A
bias::V
stride::NTuple{N,Int}
pad::NTuple{M,Int}
dilation::NTuple{N,Int}
end
function Conv(w::AbstractArray{T,N}, b::AbstractVector{T}, σ = identity;
stride = 1, pad = 0, dilation = 1) where {T,N}
stride = expand(Val(N-2), stride)
pad = expand(Val(2*(N-2)), pad)
dilation = expand(Val(N-2), dilation)
return Conv(σ, w, b, stride, pad, dilation)
end
Conv(k::NTuple{N,Integer}, ch::Pair{<:Integer,<:Integer}, σ = identity;
init = glorot_uniform, stride = 1, pad = 0, dilation = 1) where N =
Conv(init(k..., ch...), zeros(ch[2]), σ,
stride = stride, pad = pad, dilation = dilation)
function (c::Conv)(x::AbstractArray)
# TODO: breaks gpu broadcast :(
# ndims(x) == ndims(c.weight)-1 && return squeezebatch(c(reshape(x, size(x)..., 1)))
σ, b = c.σ, reshape(c.bias, map(_->1, c.stride)..., :, 1)
cdims = DenseConvDims(x, c.weight; stride=c.stride, padding=c.pad, dilation=c.dilation)
σ.(conv(x, c.weight, cdims) .+ b)
end
@functor Conv
https://github.com/FluxML/Flux.jl/blob/master/src/optimise/train.jl
just calls Zygote `gradient`.
https://github.com/FluxML/Zygote.jl/blob/master/src/lib/nnlib.jl
@adjoint conv(x, w, cdims; kw...) =
conv(x, w, cdims; kw...),
Δ -> begin
return (
NNlib.∇conv_data(Δ, w, cdims; kw...),
NNlib.∇conv_filter(x, Δ, cdims; kw...),
nothing,
)
end
@adjoint ∇conv_data(x, w, cdims; kw...) =
∇conv_data(x, w, cdims; kw...),
Δ -> begin
return (
NNlib.conv(Δ, w, cdims; kw...),
NNlib.∇conv_filter(Δ, x, cdims; kw...),
nothing,
)
end
@adjoint depthwiseconv(x, w, cdims; kw...) =
depthwiseconv(x, w, cdims; kw...),
Δ -> begin
return (
NNlib.∇depthwiseconv_data(Δ, w, cdims; kw...),
NNlib.∇depthwiseconv_filter(x, Δ, cdims; kw...),
nothing,
)
end
Caveat Emptor
Zygote is in an early stage and may break, but issue reports and beta testing are welcome. In particular Zygote does not yet have comprehensive gradient definitions and may fail if it hits complex code in Base Julia.
Zygote’s runtime performance should generally be good, but compile times are not optimised, so calling gradient the first time can have noticeable lag. BenchmarkTools is recommended to avoid measuring JIT time.
A current limitation is that Zygote will not automatically see redefined functions (for example if you call gradient(f, x), then redefine f, then take the gradient again). You can call Zygote.refresh() to completely reset what Zygote sees. It’s often useful to have this in your script/notebook after function definitions.
The Julia compiler does not yet support all features needed to make Zygote fast, particularly in the presence of control flow. Until these are officially supported Zygote contains a flag to enable faster operation. If you can handle the additional caveats it’s a good way to see Zygote’s peak performance.
Zygote docs: https://fluxml.ai/Zygote.jl/dev/ https://fluxml.ai/Zygote.jl/dev/adjoints/ https://fluxml.ai/Zygote.jl/dev/utils/
https://github.com/timholy/ProfileView.jl
https://fluxml.ai/Zygote.jl/dev/profiling/
https://github.com/FluxML/ZygoteRules.jl
Zygote has many adjoints for non-mathematical operations such as for indexing and data structures. Though these can still be seen as linear functions of vectors, it’s not particularly enlightening to implement them with an actual matrix multiply. In these cases it’s easiest to think of the adjoint as a kind of inverse. For example, the gradient of a function that takes a tuple to a struct (e.g. y = Complex(a, b)) will generally take a struct to a tuple ((ȳ.re, ȳ.im)). The gradient of a getindex y = x[i…] is a setindex! x̄[i…] = ȳ, etc.
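A tiny illustration of the getindex/setindex! duality (my example, using Zygote's public gradient function):

using Zygote

f(x) = x[2]^2
Zygote.gradient(f, [1.0, 3.0, 5.0])   # ([0.0, 6.0, 0.0],): the cotangent is scattered back into index 2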
Julia has built-in BitArray: https://github.com/JuliaLang/julia/blob/master/base/bitarray.jl
https://github.com/JuliaGPU/CuArrays.jl/blob/master/src/array.jl
https://docs.julialang.org/en/v1/base/simd-types/index.html
Type VecElement{T} is intended for building libraries of SIMD operations. Practical use of it requires using llvmcall. The type is defined as:
struct VecElement{T}
    value::T
end
It has a special compilation rule: a homogeneous tuple of VecElement{T} maps to an LLVM vector type when T is a primitive bits type and the tuple length is in the set {2-6,8-10,16}.
At -O3, the compiler might automatically vectorize operations on such tuples. For example, the following program, when compiled with julia -O3 generates two SIMD addition instructions (addps) on x86 systems:
const m128 = NTuple{4,VecElement{Float32}}
function add(a::m128, b::m128)
    (VecElement(a[1].value+b[1].value),
     VecElement(a[2].value+b[2].value),
     VecElement(a[3].value+b[3].value),
     VecElement(a[4].value+b[4].value))
end
triple(c::m128) = add(add(c,c),c)
code_native(triple,(m128,))
However, since the automatic vectorization cannot be relied upon, future use will mostly be via libraries that use llvmcall.
https://docs.julialang.org/en/v1/base/c/#Core.Intrinsics.llvmcall
https://kristofferc.github.io/post/intrinsics/
https://juliagpu.gitlab.io/CUDAnative.jl/
http://mikeinnes.github.io/2017/08/24/cudanative.html
vote: https://juliagpu.gitlab.io/CUDAnative.jl/lib/device/cuda/#Warp-Vote-1 https://github.com/JuliaGPU/CUDAnative.jl/blob/master/src/device/cuda/warp_vote.jl
other stuff:
https://github.com/JuliaGPU/CUDAnative.jl/tree/master/src/device/cuda
intrinsic calls are made through the @wrap macro:
https://github.com/JuliaGPU/CUDAnative.jl/blob/master/src/device/tools.jl#L44
what about cuda intrinsics in general? https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#mathematical-functions-appendix
note: julia will call these.
what’s the difference between functions and intrinsics? https://stackoverflow.com/questions/24085833/cuda-math-api-difference-between-functions-and-intrinsics
The answer lies in Appendix D of the programming guide. The intrinsics for the transcendental, trigonometric, and special functions are faster, but have more domain restrictions and generally lower accuracy than their software counterparts. For the primary purpose of the hardware (ie graphics), having fast approximate functions for sin, cos, square root, reciprocal, etc. allows for improved shader performance when ultimate mathematical accuracy is not critical. For some compute tasks, the less accurate versions are also fine. For other applications, the intrinsics may not be sufficient.
https://juliagpu.gitlab.io/CUDAnative.jl/lib/device/cuda/#Shared-Memory-1
why use it? http://tdesell.cs.und.edu/lectures/cuda_3.pdf “If we can get these threads to collaborate in loading their global memory into shared memory, we could speed up the global memory access time by 4x (as each thread would load 1/4th the global memory in parallel with the rest). In general, if our blocks are sized N x N, we can get an Nx reduction of global memory traffic.”
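A rough CUDAnative sketch of that cooperative-load idea (my code, not the project kernel; it only stages a tile through shared memory to show the pattern):

using CUDAnative, CuArrays

function tile_copy_kernel(out, x)
    tile = @cuStaticSharedMem(Float32, 256)               # one tile per block
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    tile[threadIdx().x] = x[i]                            # each thread loads one element
    sync_threads()
    # ...a real kernel would now have every thread reuse neighbours from `tile`
    # instead of re-reading global memory...
    out[i] = tile[threadIdx().x]
    return
end

x = CuArrays.rand(Float32, 1024)
out = similar(x)
@cuda threads=256 blocks=4 tile_copy_kernel(out, x)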
https://juliagpu.gitlab.io/CuArrays.jl/tutorials/generated/intro/
CUDAdrv.@profile bench_gpu1!(y_d, x_d)
grid x: 2^31 - 1; grid y: 65535; grid z: 65535
block x: 1024; block y: 1024; block z: 64
threads per block (x * y * z): 512 (cc 1.x) / 1024 (cc 2.0+)
warp size: 32
https://devtalk.nvidia.com/default/topic/498823/warp-layout-in-a-2d-thread-block-/
According to the programming guide, it goes by x_index first, then y_index, then z_index. For the purposes of warp grouping threads don’t have 3 dimensional indices, they just go by 1. This index is given by threadId = threadIdx.x+blockDim.x*(threadIdx.y+blockDim.y*threadIdx.z). Every 32 threads of this index is a new warp.
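The same formula in CUDAnative's 1-based indexing (device-side helpers, only meaningful inside a kernel; the names are mine):

using CUDAnative

flat_tid() = (threadIdx().x - 1) +
             blockDim().x * ((threadIdx().y - 1) + blockDim().y * (threadIdx().z - 1))
warp_id()  = flat_tid() ÷ 32          # every 32 consecutive flat indices form a warp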
https://docs.julialang.org/en/v1/stdlib/Test/index.html
https://discourse.julialang.org/t/whats-the-status-of-image-convolutions-on-cpu-gpu/6093 https://www.reddit.com/r/Julia/comments/e2ophj/what_is_the_state_of_gpu_convolution/ https://juliaimages.org/latest/ https://github.com/FluxML/NNlib.jl https://github.com/JuliaGPU/CuArrays.jl https://github.com/JuliaGPU/CUDAnative.jl https://juliagpu.gitlab.io/CuArrays.jl/tutorials/generated/intro/ https://github.com/JuliaGPU/GPUArrays.jl
https://docs.julialang.org/en/v1/manual/metaprogramming/index.html
yay cuda-toolkit /opt/cuda/bin/nvvp
https://devblogs.nvidia.com/cuda-pro-tip-nvprof-your-handy-universal-gpu-profiler/ https://www.parallel-computing.pro/index.php/9-cuda/54-remote-profiling-with-nvidia-visual-profiler-on-a-slurm-based-cluster
nvprof --analysis-metrics -o quantprof1.nvprof julia quantprof1.jl
nvprof -f -o quantprof1.nvprof julia quantprof1.jl
nvprof --analysis-metrics -o nbody-analysis.nvprof FILE --benchmark -numdevices=2 -i=1
nvprof --profile-from-start off -f -o quantprof2.nvvp --analysis-metrics --unified-memory-profiling off --replay-mode disabled ../julia-1.3.0/bin/julia quantprof2.jl
nvprof --profile-from-start off -f --export-profile quantprof2.nvvp --unified-memory-profiling off --replay-mode disabled -a global_access,shared_access,branch,instruction_execution,pc_sampling ../julia-1.3.0/bin/julia quantprof2.jl
global_access: global access
shared_access: shared access
branch: divergent branch
instruction_execution: instruction execution
pc_sampling: pc sampling, available only for GM20X+
Nsight Systems:
> For improved usability, launch Julia under the Nsight Systems profiler:
> $ nsys launch -t cuda,cublas,cudnn,nvtx julia
local: /opt/cuda/nsight-compute-2019.5.0/nv-nsight-cu
remote: . julia-setup.sh https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html
the issue: mismatched cuda versions.
i’ma install a raw julia install and see if that works …yup
https://docs.nvidia.com/nsight-compute/NsightCompute/index.html#profiler-report
. julia-setup.sh
cd 6.338
/usr/local/cuda/NsightCompute-1.0/nv-nsight-cu-cli -o profile --section InstructionStats --section MemoryWorkloadAnalysis --section SchedulerStats --section SourceCounters --section SpeedOfLight --section WarpStateStats --section ComputeWorkloadAnalysis --section LaunchStats --section Occupancy -c 4 -f ../julia-1.3.0/bin/julia quantprof1.jl
scp [email protected]:/data/scratch/jhgilles/6.338/\*.nsight-cuprof-report ./
/opt/cuda/nsight-compute-2019.5.0/nv-nsight-cu
https://docs.nvidia.com/nsight-compute/2019.1/pdf/NsightComputeCli.pdf
https://github.com/apache/incubator-tvm/blob/master/topi/python/topi/cuda/conv2d.py https://github.com/apache/incubator-tvm/blob/master/topi/python/topi/cuda/conv2d_direct.py
https://docs.tvm.ai/tutorials/optimize/opt_conv_cuda.html https://docs.tvm.ai/tutorials/optimize/opt_conv_cuda.html#sphx-glr-tutorials-optimize-opt-conv-cuda-py
out_size = (in_size - kernel + 2*pad) // stride + 1
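Same formula as a one-liner (my transcription; integer division):

out_size(in_size, kernel, pad, stride) = (in_size - kernel + 2pad) ÷ stride + 1
out_size(32, 3, 1, 1)   # 32: "same" padding for a 3×3 kernel, stride 1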
- each block is for some (x,y), computes (block_factor x block_factor) region of output along (out_channel, batch)
- grabs a chunk of (step x block_factor) from both inputs; along (in_channel x batch) for input, (in_channel x out_channel) for kernel
- thread tiling scheme: i don’t entirely understand it.
- splits along vthreads – instead of walking threads along (1,2,3,4), they take strided jumps to avoid memory bank conflicts.
- how many threads are there actually though?
- how many to you need to max out an SM?
- can multiple blocks execute on the same SM?
- how many registers are there per SM? -
- reductions are per-thread
https://stackoverflow.com/questions/4391162/cuda-determining-threads-per-block-blocks-per-grid
We further split the workload from a thread block to individual threads. To avoid memory bank conflict, we use virtual thread to split the area into 4 parts, and then tile into 8x8 grids. Therefore, shown in the figure below, each thread computes 4 strided grids, where size of each grid is 4 x 4.
https://stackoverflow.com/questions/3841877/what-is-a-bank-conflict-doing-cuda-opencl-programming
So if each thread in a halfwarp accesses successive 32bit values there are no bank conflicts. An exception from this rule (every thread must access its own bank) are broadcasts: If all threads access the same address, the value is only read once and broadcasted to all threads (for GT200 it has to be all threads in the halfwarp accessing the same address, iirc fermi and AMD gpus can do this for any number of threads accessing the same value).
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#performance-guidelines
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#simt-architecture
> The term warp originates from weaving, the first parallel thread technology.
Performance optimization revolves around three basic strategies:
- Maximize parallel execution to achieve maximum utilization;
- Optimize memory usage to achieve maximum memory throughput;
- Optimize instruction usage to achieve maximum instruction throughput.
fix whatever the bottleneck is, using the profiler.
- parallel streams
- async operations
- check occupancy
minimize low-bandwidth transfers
Global memory instructions support reading or writing words of size equal to 1, 2, 4, 8, or 16 bytes. Any access (via a variable or a pointer) to data residing in global memory compiles to a single global memory instruction if and only if the size of the data type is 1, 2, 4, 8, or 16 bytes and the data is naturally aligned (i.e., its address is a multiple of that size).
For structures, the size and alignment requirements can be enforced by the compiler using the alignment specifiers __align__(8) or __align__(16), such as …
BaseAddress + width * ty + tx
For these accesses to be fully coalesced, both the width of the thread block and the width of the array must be a multiple of the warp size.
Arrays for which it cannot determine that they are indexed with constant quantities, Large structures or arrays that would consume too much register space, Any variable if the kernel uses more registers than available (this is also known as register spilling).
A multiprocessor consists of:
- 128 (6.1 and 6.2) CUDA cores for arithmetic operations,
- 32 (6.1 and 6.2) special function units for single-precision floating-point transcendental functions,
- 4 (6.1 and 6.2) warp schedulers.
When a multiprocessor is given warps to execute, it first distributes them among its schedulers. Then, at every instruction issue time, each scheduler issues one instruction for one of its assigned warps that is ready to execute, if any.
A multiprocessor has:
- a read-only constant cache that is shared by all functional units and speeds up reads from the constant memory space, which resides in device memory,
- a unified L1/texture cache for reads from global memory of size 24 KB (6.0 and 6.2) or 48 KB (6.1),
- a shared memory of size 64 KB (6.0 and 6.2) or 96 KB (6.1).
The unified L1/texture cache is also used by the texture unit that implements the various addressing modes and data filtering mentioned in Texture and Surface Memory.
Shared memory has 32 banks that are organized such that successive 32-bit words map to successive banks. Each bank has a bandwidth of 32 bits per clock cycle.
A shared memory request for a warp does not generate a bank conflict between two threads that access any address within the same 32-bit word (even though the two addresses fall in the same bank): In that case, for read accesses, the word is broadcast to the requesting threads and for write accesses, each address is written by only one of the threads (which thread performs the write is undefined).
presumably there’s some lane-shuffle thing we can do.
answer: use a reduction via OR or an SIMD intrinsic.
on gpu: we have access to warp vote to 32 bits, we should use that, then reduce to 64 bits. https://stackoverflow.com/questions/39488441/how-to-pack-bits-efficiently-in-cuda
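A hedged CUDAnative sketch of that packing idea (my code; it assumes the input length is a multiple of 32 and uses the vote_ballot intrinsic from the warp-vote link above):

using CUDAnative, CuArrays

function pack_signs_kernel(packed, x)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    lane = (threadIdx().x - 1) % 32           # lane within the warp (0-based)
    bits = vote_ballot(x[i] >= 0f0)           # 32 sign bits, one per lane
    if lane == 0                              # one thread per warp writes the packed word
        packed[(i - 1) ÷ 32 + 1] = bits
    end
    return
end

x = CuArrays.rand(Float32, 1024) .- 0.5f0
packed = CuArrays.zeros(UInt32, length(x) ÷ 32)
@cuda threads=256 blocks=4 pack_signs_kernel(packed, x)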
constant static LUTs for values + binary search
on gpu: that + simple cuda code for indices
$A = [a_{ij}]$,
https://en.wikipedia.org/wiki/Parry_Moon#Holors (neat, but nobody uses it)
http://www.iaeng.org/publication/WCE2010/WCE2010_pp1829-1833.pdf
https://github.com/FluxML/model-zoo/
conda install cudnn
FluxML/Flux.jl#947 FluxML/Flux.jl#817
https://github.com/FluxML/Metalhead.jl
idea: define a data structure QuantArray / CuQuantArray which holds a packed, quantized array + its scalar constants. the goal: never write floats to global memory.
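A hypothetical shape for that container (names and fields are mine, just to pin the idea down):

using CuArrays

struct CuQuantArray{N}
    bits::CuArray{UInt32, N}     # packed ±1 bit-planes
    basis::Vector{Float32}       # the learned scale factors α_i
    dims::NTuple{N, Int}         # logical (unpacked) size
end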
“neural networks are cool! there’s probably one racially profiling you as we speak.”
- idea: packing chunks of arrays into tiles? would that help at all?
We’ll be memory-limited, so any extra computation we can do to make the outputs more friendly to read would be good. I wonder if we can fuse multiple convolutions in-warp or in-block so that we don’t need to write anything but outputs to global memory.
https://github.com/JuliaPolyhedra/Polyhedra.jl https://inf.ethz.ch/personal/fukudak/polyfaq/polyfaq.html http://isl.gforge.inria.fr//user.html
a lO cite:QuantFriendlyMobileNet
https://stanford.edu/class/ee103/julia_slides/julia_least_squares_slides.pdf https://stackoverflow.com/questions/22399794/qr-decomposition-to-solve-linear-systems-in-cuda
doesn’t work in cuda
- use cpu?
- gels batched? https://github.com/JuliaGPU/CuArrays.jl/blob/cccb7b507501c1c8c7933cae8388170985f2fbb9/src/blas/wrappers.jl#L1674
just use cpu.
relu before activations. duhhh just don’t use them.
https://github.com/FluxML/NNlib.jl/blob/136aa8285d3f00375c65eef90f3559af6f70c69a/src/dim_helpers/DenseConvDims.jl https://github.com/FluxML/NNlib.jl/blob/master/src/conv.jl
no, apparently :/ and CuArrays doesn’t support strides…
let’s just transpose during the binarization op? arrange it so that all the channels for a region get brought in
https://carlthome.github.io/posts/nhwc-vs.-nchw%20/ https://jhui.github.io/2017/03/07/TensorFlow-Perforamnce-and-advance-topics/
NCHW is slightly faster on GPU
no option for NHWC in julia …just account for it in the kernel :)
https://docs.julialang.org/en/v1/devdocs/reflection/index.html
weird....
no need: https://stackoverflow.com/questions/11888772/when-to-call-cudadevicesynchronize
have intermediate op as parameter (w/ constants for vals?) or just impl batchnorm manually
kernel size: PHWOMN = 1 * 3 * 3 * 64 * 3 * 3 * 8 bytes = 41472 bytes ~= 41 KB
hm, that’s pretty big. 40 8-byte reads per block member.
could we use cuda constant memory? https://cuda-programming.blogspot.com/2013/01/what-is-constant-memory-in-cuda.html
no: reads from alternate locations are serialized.
just add another dimension lol
segfault in cudaapi on replay (maybe the GC?)
in expression starting at /data/scratch/jhgilles/6.338/quantprof2.jl:222
unknown function (ip: 0x7f34c5d9bacc)
unknown function (ip: 0x7f34c5d9d25e)
unknown function (ip: 0x7f34c5e096ef)
unknown function (ip: 0x7f34c5e03709)
cuLaunchKernel at /usr/lib/x86_64-linux-gnu/libcuda.so (unknown line)
macro expansion at /data/scratch/jhgilles/.julia/packages/CUDAapi/CCgJL/src/call.jl:40 [inlined]
macro expansion at /data/scratch/jhgilles/.julia/packages/CUDAdrv/3EzC1/src/error.jl:121 [inlined]
cuLaunchKernel at /data/scratch/jhgilles/.julia/packages/CUDAdrv/3EzC1/src/libcuda.jl:966
unknown function (ip: 0x7f34c81081e4)
_jl_invoke at /buildworker/worker/package_linux64/build/src/gf.c:2141 [inlined]
jl_apply_generic at /buildworker/worker/package_linux64/build/src/gf.c:2305
#350 at /data/scratch/jhgilles/.julia/packages/CUDAdrv/3EzC1/src/execution.jl:97
macro expansion at /data/scratch/jhgilles/.julia/packages/CUDAdrv/3EzC1/src/execution.jl:63 [inlined]
macro expansion at ./gcutils.jl:91 [inlined]
macro expansion at /data/scratch/jhgilles/.julia/packages/CUDAdrv/3EzC1/src/execution.jl:61 [inlined]
pack_arguments at /data/scratch/jhgilles/.julia/packages/CUDAdrv/3EzC1/src/execution.jl:40
unknown function (ip: 0x7f34c810784d)
https://github.com/JuliaGPU/CUDAapi.jl
jhgilles@vcuda-0:/data/scratch/jhgilles/6.338$ nvprof --profile-from-start off -f -o quantprof2.nvvp --analysis-metrics --unified-memory-profiling off --replay-mode disabled ../julia-1.3.0/bin/julia quantprof2.jl
it’s just `popc`
1.169 ms vs 936.431 μs …oh it’s just because my input is small (x,y); that’s a significant amount more compute
https://github.com/JuliaGPU/CUDAnative.jl/blob/5dbdc8961abbdefc5d61efc3b84a90c47c6cd811/src/device/cuda/memory_shared.jl
> Note that the amount of dynamic shared memory needs to be specified when launching the kernel.
that’s IMPORTANT
need `shmem` in @cuda https://github.com/JuliaGPU/CUDAnative.jl/blob/b55dee83f0cf4e51c937e449cb30f5e0101c846e/src/execution.jl
to avoid bank conflicts.
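Minimal sketch of sizing dynamic shared memory at launch (my example; the kernel just stages values through the dynamically-sized buffer):

using CUDAnative, CuArrays

function dynshmem_kernel(x)
    buf = @cuDynamicSharedMem(Float32, blockDim().x)   # size comes from the launch
    buf[threadIdx().x] = x[threadIdx().x]
    sync_threads()
    return
end

x = CuArrays.rand(Float32, 256)
@cuda threads=256 shmem=256 * sizeof(Float32) dynshmem_kernel(x)   # bytes for the dynamic allocation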
abstract arguments don’t slow things. (containers of abstract types are boxed though.)
supertype() and subtypes() exist
every type has 1 supertype
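Quick checks of those notes (standard Julia reflection; subtypes needs InteractiveUtils outside the REPL):

using InteractiveUtils

supertype(Float64)    # AbstractFloat: every type has exactly one supertype
subtypes(Integer)     # [Bool, Signed, Unsigned]
Real[1, 2.0]          # Vector{Real}: a container of an abstract element type (elements get boxed)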