add modelnet to data, NS cleanup
maskomic committed Oct 10, 2023
1 parent 37a0ae2 commit 3af5e9c
Showing 6 changed files with 134 additions and 119 deletions.
1 change: 1 addition & 0 deletions src/GroupAD.jl
@@ -19,6 +19,7 @@ using HDF5
export GenerativeModels

include("data.jl")
include("modelnet.jl")
include("toy.jl")
include("experiments.jl")
include("experimental_loops.jl")
2 changes: 2 additions & 0 deletions src/data.jl
@@ -543,6 +543,8 @@ function load_data(dataset::String, ratios=(0.6,0.2,0.2); seed=nothing, method =
data_normal, data_anomalous = load_lhco(dataset; kwargs...)
elseif dataset in mvtec_datasets
data_normal, data_anomalous = load_mvtec(dataset; kwargs...)
elseif dataset == "modelnet"
return load_modelnet(;kwargs...)
else
data_normal, data_anomalous = load_mill_data(dataset; kwargs...)
end
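For orientation, a minimal sketch of how the new branch is reached (values are illustrative; whether options such as `seed` actually reach `load_modelnet` depends on which keywords `load_data` captures into `kwargs`):

    # "modelnet" now short-circuits to the dedicated loader and returns its splits directly
    train, val, test = load_data("modelnet")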
92 changes: 7 additions & 85 deletions src/experimental_loops.jl
@@ -174,13 +174,19 @@ function point_cloud_experimental_loop(sample_params_f, fit_f, edit_params_f,
data = GroupAD.leave_one_in(data; seed=seed)
elseif method == "leave-one-out"
data = GroupAD.leave_one_out(data; seed=seed)
elseif dataset == "modelnet"
nothing
else
error("This model can only run on point cloud datasets!")
end

# define where data is going to be saved
# _savepath = joinpath(savepath, "$(modelname)/$(dataset)/$(method)/class_index=$(class)/seed=$(seed)")
_savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
if dataset == "modelnet"
_savepath = joinpath(savepath, "$(modelname)/$(method)/seed=$(seed)")
else
_savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
end
mkpath(_savepath)

# edit parameters
@@ -236,90 +242,6 @@
(try_counter == max_tries) ? (@info "Reached $(max_tries) tries, giving up.") : nothing
end

"""
point_cloud_experimental_loop(sample_params_f, fit_f, edit_params_f,
max_seed, modelname, dataset, contamination, savepath, anomaly_classes, method)
This function takes a function that samples parameters, a fit function and a function that edits the sampled
parameters and other parameters. Then it loads data, samples hyperparameters, calls the fit function
that is supposed to construct and fit a model and finally evaluates the returned score functions on
the loaded data.
This function works for point cloud datasets. Differentiation between leave-one-in and leave-one-out
setting is done via parameter `method`.
This particular loop only loops over 1:max_seed with given anomaly class.
"""
function point_cloud_experimental_loop2(sample_params_f, fit_f, edit_params_f,
max_seed, modelname, dataset, contamination, savepath, anomaly_class, method)
# sample the random hyperparameters
parameters = sample_params_f()

# for the given number of seeds run the experiment
for seed in 1:max_seed
# with these hyperparameters, train and evaluate the model on different train/val/tst splits
# load data for either "MNIST_in" or "MNIST_out" and set the setting
# prepared for other point cloud datasets such as ModelNet10

# load data
data = load_data(dataset, anomaly_class_ind=anomaly_class, seed=seed, method=method, contamination=contamination)

# undersample data for leave-one-in and leave-one-out methods
if method == "leave-one-in"
data = GroupAD.leave_one_in(data; seed=seed)
elseif method == "leave-one-out"
data = GroupAD.leave_one_out(data; seed=seed)
else
error("This model can only run on point cloud datasets!")
end

# define where data is going to be saved
_savepath = joinpath(savepath, "$(modelname)/$(dataset)/$(method)/class_index=$(anomaly_class)/seed=$(seed)")
mkpath(_savepath)

# edit parameters
edited_parameters = edit_params_f(data, parameters, anomaly_class, method)

@info "Trying to fit $modelname on $(dataset) in $method setting.\nModel parameters: $(edited_parameters)..."
@info "Train/validation/test splits: $(size(data[1][1], 2)) | $(size(data[2][1], 2)) | $(size(data[3][1], 2))"
@info "Number of features: $(size(data[1][1], 1))"

@info "Params check done. Trying to fit."
# fit
training_info, results = fit_f(data, edited_parameters)

# save the model separately
if training_info.model != nothing
modelf = joinpath(_savepath, savename("model", edited_parameters, "bson", digits=5))
tagsave(
modelf,
Dict("model"=>training_info.model,
"fit_t"=>training_info.fit_t,
"history"=>training_info.history,
"parameters"=>edited_parameters
),
safe = true)
(@info "Model saved to $modelf")

training_info = merge(training_info, (model = nothing, history=nothing))
end

# here define what additional info should be saved together with parameters, scores, labels and predict times
save_entries = merge(training_info, (modelname = modelname, seed = seed, dataset = dataset))

# now loop over all anomaly score funs
@time for result in results
if modelname in ["vae_instance", "statistician", "PoolModel"]
@info "Trying to save results..."
experiment_bag(result..., data, _savepath; save_entries...)
else
experiment(result..., data, _savepath; save_entries...)
end
end
end
end
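A hypothetical driver call for this loop (every value below is a placeholder, not taken from the repository):

    point_cloud_experimental_loop2(
        sample_params, fit, edit_params,   # assumed user-supplied functions
        5,                                 # max_seed
        "vae_instance", "MNIST_in",        # modelname, dataset
        0.0, "results",                    # contamination, savepath
        1, "leave-one-in",                 # anomaly_class, method
    )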


"""
toy_experimental_loop_toy(sample_params_f, fit_f, edit_params_f,
max_seed, modelname, dataset, contamination, savepath)
84 changes: 84 additions & 0 deletions src/modelnet.jl
@@ -0,0 +1,84 @@
using HDF5
using Random
using StatsBase
using Mill

# Convert a 3×n×B tensor of B point clouds (n points each) into a Mill.jl
# BagNode: instances are flattened column-wise and grouped into B equal bags.
function tensors_to_mill(data)
n = size(data, 2)
lengths = repeat([n], size(data, 3))
idxes = Mill.length2bags(lengths)
flatten = reshape(data, 3, n * size(data, 3))
return BagNode(ArrayNode(flatten), idxes)
end
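A quick sketch of what `tensors_to_mill` produces (shapes are illustrative):

    using Mill

    data = randn(Float32, 3, 1024, 4)   # 4 point clouds, 1024 points, 3 coordinates each
    bn = tensors_to_mill(data)          # BagNode with 4 bags of 1024 instances
    size(bn.data.data)                  # (3, 4096): instances flattened column-wise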

function load_modelnet(npoints=2048; method="chair", validation::Bool=true, ratio=0.2, seed::Int=666, kwargs...)
# `method` actually carries the name of the normal class here; reusing the
# existing keyword avoids changing too many call sites.
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = _load_modelnet10(npoints, method; validation=validation, ratio=ratio, seed=seed)
X_train = tensors_to_mill(X_train)
X_val = tensors_to_mill(X_val)
X_test = tensors_to_mill(X_test)
return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)
end
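A hedged usage sketch (keyword values are illustrative; recall that `method` names the normal class):

    # "chair" bags are normal; every other class counts as anomalous in val/test
    (X_train, y_train), (X_val, y_val), (X_test, y_test) =
        load_modelnet(2048; method="chair", ratio=0.2, seed=42)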

function train_test_split(X, y, ratio=0.2; seed=nothing)
# simple util function
(seed!==nothing) ? Random.seed!(seed) : nothing

N = size(X,3)
idx_samples = sample(1:N, Int(floor(N*ratio)), replace=false)
idx_bool = zeros(Bool,N)
idx_bool[idx_samples] .= true

X_val = X[:,:,idx_bool]
Y_val = y[idx_bool]
X_train = X[:,:,.!idx_bool]
Y_train = y[.!idx_bool]

(seed!==nothing) ? Random.seed!() : nothing
return (X_train, Y_train), (X_val, Y_val)
end
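A minimal illustration of the split (shapes assumed):

    X = randn(3, 512, 100); y = rand(Bool, 100)
    (Xtr, ytr), (Xvl, yvl) = train_test_split(X, y, 0.2; seed=1)
    size(Xtr, 3), size(Xvl, 3)   # (80, 20)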


function _load_modelnet10(npoints=2048, type="all"; validation::Bool=true, ratio=0.2, seed::Int=666)
"""
npoints ... number of points per object (512 / 1024 / 2048)
type ... name of the normal class, e.g. \"chair\", \"monitor\" (the \"all\" option only applies to the commented-out variant below)
validation ... whether to also return a validation set (\"true\") or not (\"false\")
seed ... random seed for the validation split
"""

# load data (the do-block closes the HDF5 file handle after reading)
X_train, X_test, Y_train, Y_test = HDF5.h5open("/home/maskomic/projects/GroupAD.jl/data/modelnet10_$(npoints).h5") do data
read(data["X_train"]), read(data["X_test"]), read(data["Y_train"]), read(data["Y_test"])
end

titles = ["bathtub", "bed", "chair", "desk", "dresser", "monitor", "night_stand", "sofa", "table", "toilet"]

# if validation
# (X_train,Y_train), (X_val,Y_val) = train_test_split(X_train, Y_train, ratio, seed=seed)
# if type in titles
# idx = findmax(titles .== type)[2]
# X_train = X_train[:, :, Y_train .== idx]
# Y_train = zeros(Bool,size(Y_train[Y_train .== idx]))
# Y_val = Y_val .!= idx
# Y_test = Y_test .!= idx
# end
# data = ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
# else
# if type in titles
# idx = findmax(titles .== type)[2]
# X_train = X_train[:, :, Y_train .== idx]
# Y_train = zeros(Bool,size(Y_train[Y_train .== idx]))
# Y_test = Y_test .!= idx
# end
# data = ((X_train, Y_train), (X_test, Y_test))
# end
# validation split, then one-class filtering: keep only `type` bags for training
# and mark every bag whose class differs from `type` as anomalous in val/test
(X_train,Y_train), (X_val,Y_val) = train_test_split(X_train, Y_train, ratio, seed=seed)
idx = findmax(titles .== type)[2]
X_train = X_train[:, :, Y_train .== idx]
Y_train = zeros(Bool,size(Y_train[Y_train .== idx]))
Y_val = Y_val .!= idx
Y_test = Y_test .!= idx
data = ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
return data
end
40 changes: 15 additions & 25 deletions src/models/PoolModel.jl
@@ -23,28 +23,30 @@ transformation of all instances, the latter only transforms the one-vector summary
The summary is created with a pooling function which has to be permutation invariant.
Possible functions include: mean, sum, maximum, etc.
"""
struct PoolModel{pre <: Chain, post <: Chain, fun <: Function, p <: ContinuousMultivariateDistribution, e <: ConditionalMvNormal, g <: ConditionalMvNormal, d <: Chain}
struct PoolModel{pre <: Chain, post <: Chain, fun <: Function, g <: ConditionalMvNormal, d <: Chain}
prepool_net::pre
postpool_net::post
poolf::fun
prior::p
encoder::e
generator::g
decoder::d
end

Flux.@functor PoolModel

function Flux.trainable(m::PoolModel)
(prepool_net = m.prepool_net, postpool_net = m.postpool_net, encoder = m.encoder, generator = m.generator, decoder = m.decoder)
(prepool_net = m.prepool_net, postpool_net = m.postpool_net, generator = m.generator, decoder = m.decoder)
end

function PoolModel(pre, post, fun, gen, dec, enc::ConditionalMvNormal, plength::Int)
W = first(Flux.params(enc))
μ = fill!(similar(W, plength), 0)
σ = fill!(similar(W, plength), 1)
prior = DistributionsAD.TuringMvNormal(μ, σ)
PoolModel(pre, post, fun, prior, enc, gen, dec)
end
# function PoolModel(pre, post, fun, gen, dec)
# PoolModel(pre, post, fun, gen, dec)
# end

function (m::PoolModel)(x)
v = m.prepool_net(x)
p = m.poolf(v)
p_post = m.postpool_net(p)
z = hcat([rand(m.generator, p_post) for i in 1:size(x, 2)]...)
m.decoder(z)
end
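For reference, a few pooling functions that satisfy the permutation-invariance requirement on `poolf` (a sketch; the names are illustrative, not from the repository):

    using Statistics

    # each one collapses a d×n instance matrix to a d-vector summary,
    # independent of the column (instance) order
    meanpool(x) = vec(mean(x; dims=2))
    sumpool(x)  = vec(sum(x; dims=2))
    maxpool(x)  = vec(maximum(x; dims=2))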

function Base.show(io::IO, pm::PoolModel)
@@ -88,25 +90,12 @@ function pm_constructor(;idim, hdim, predim, postdim, edim, activation="swish",
)

if var == "scalar"
# encoder
enc = Chain(
build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
SplitLayer(hdim,[edim,1])
)
enc_dist = ConditionalMvNormal(enc)

gen = Chain(
build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
SplitLayer(hdim,[edim,1])
)
gen_dist = ConditionalMvNormal(gen)
else
enc = Chain(
build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
SplitLayer(hdim,[edim,edim])
)
enc_dist = ConditionalMvNormal(enc)

gen = Chain(
build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
SplitLayer(hdim,[edim,edim])
@@ -119,7 +108,7 @@
Dense(hdim,idim)
)

pm = PoolModel(pre, post, fun, gen_dist, dec, enc_dist, edim)
pm = PoolModel(pre, post, fun, gen_dist, dec)
return pm
end

@@ -229,7 +218,8 @@ function StatsBase.fit!(model::PoolModel, data::Tuple, loss::Function;
for batch in RandomBatches(tr_x, 10)
# classic training
bag_batch = RandomBagBatches(tr_x,batchsize=batchsize,randomize=true)
Flux.train!(lossf, ps, [bag_batch], opt)
# Flux.train!(lossf, ps, [bag_batch], opt)
Flux.train!(lossf, ps, bag_batch, opt)
# only batch training loss
batch_loss = lossf(bag_batch) # mean(lossf.(bag_batch))

34 changes: 25 additions & 9 deletions src/models/hmil.jl
@@ -159,17 +159,17 @@ function fit_hmil!(model::Chain, data::Tuple, loss::Function;
ps = Flux.params(tr_model)
_patience = patience

best_val_loss = Inf
i = 1
start_time = time()

# prepare data
tr, vl = classification_data(data, na, seed)
train_data, train_labels = tr[1], Flux.onehotbatch(tr[2], [0,1])
val_data, val_labels = vl[1], Flux.onehotbatch(vl[2], [0,1])

lossf(x, y) = loss(tr_model, x, y)

best_val_loss = Inf
i = 1
start_time = time()

# infinite for loop via RandomBatches
for batch in RandomBatches(train_data, 1)
# classic training
@@ -185,8 +185,13 @@
# validation/early stopping
# val_loss = lossf(bag_batch) # mean(lossf.(val_x))
val_loss = lossf(val_data, val_labels)
train_loss = lossf(train_data, train_labels)

acc = mean(Flux.onecold(tr_model(train_data), [0,1]) .== Flux.onecold(train_labels, [0,1]))

@info "$i - loss: $(batch_loss) (batch) | $(val_loss) (validation)"
@info "train loss: $train_loss"
@info "train accuracy: $acc"

if isnan(val_loss) || isnan(batch_loss)
error("Encountered invalid values in loss function.")
@@ -227,7 +232,7 @@ function hmil_basic_loop(sample_params_f, fit_f, edit_params_f,
# sample the random hyperparameters
parameters = sample_params_f()

for na in [100,10,20]
for na in [100,5,10,20]
# for na in 0
# with these hyperparameters, train and evaluate the model on different train/val/tst splits
for seed in 1:max_seed
@@ -282,11 +287,11 @@ function hmil_pc_loop(sample_params_f, fit_f, edit_params_f,
# sample the random hyperparameters
parameters = sample_params_f()

for na in [100,10,20]
Threads.@threads for na in [10, 20, 100]
# run over all `na` values in parallel with the same hyperparameters
# use more CPU cores for the calculation
@info "Starting parallel process on $(Threads.nthreads()) threads (over $max_seed seeds per na value)."
Threads.@threads for seed in 1:max_seed
for seed in 1:max_seed
# with these hyperparameters, train and evaluate the model on different train/val/tst splits
# load data for either "MNIST_in" or "MNIST_out" and set the setting
# prepared for other point cloud datasets such as ModelNet10
@@ -296,13 +301,24 @@
data = GroupAD.leave_one_in(data; seed=seed)
elseif method == "leave-one-out"
data = GroupAD.leave_one_out(data; seed=seed)
elseif dataset == "modelnet"
# data = (
# (data[1][1], .!data[1][2]),
# (data[2][1], .!data[2][2]),
# (data[3][1], .!data[3][2])
# )
nothing
else
error("This model can only run on point cloud datasets!")
end

# define where data is going to be saved
# _savepath = joinpath(savepath, "$(modelname)/$(dataset)/$(method)/class_index=$(class)/seed=$(seed)")
_savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
if dataset == "modelnet"
_savepath = joinpath(savepath, "$(modelname)/$(method)/seed=$(seed)")
else
_savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
end
mkpath(_savepath)

# edit parameters
@@ -349,4 +365,4 @@ end

function get_label_hmil(model, x)
Flux.onecold(model(x), [0,1])
end
end
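A sketch of how the helper is used (variable names are assumed):

    ŷ = get_label_hmil(model, val_data)   # vector of 0/1 predictions
    acc = mean(ŷ .== val_y)               # assuming an integer 0/1 ground-truth vector val_y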
