From 3af5e9ca1f217d16f20edd336f944f07dd01bfc7 Mon Sep 17 00:00:00 2001
From: maskomic
Date: Tue, 10 Oct 2023 07:36:47 +0200
Subject: [PATCH] add modelnet to data, NS cleanup

---
 src/GroupAD.jl            |  1 +
 src/data.jl               |  2 +
 src/experimental_loops.jl | 92 +++------------------------------------
 src/modelnet.jl           | 84 +++++++++++++++++++++++++++++++++++
 src/models/PoolModel.jl   | 40 +++++++----------
 src/models/hmil.jl        | 34 +++++++++++----
 6 files changed, 134 insertions(+), 119 deletions(-)
 create mode 100644 src/modelnet.jl

diff --git a/src/GroupAD.jl b/src/GroupAD.jl
index 7bba09e..59646fd 100755
--- a/src/GroupAD.jl
+++ b/src/GroupAD.jl
@@ -19,6 +19,7 @@ using HDF5
 export GenerativeModels
 
 include("data.jl")
+include("modelnet.jl")
 include("toy.jl")
 include("experiments.jl")
 include("experimental_loops.jl")
diff --git a/src/data.jl b/src/data.jl
index 359c67f..45d61cd 100644
--- a/src/data.jl
+++ b/src/data.jl
@@ -543,6 +543,8 @@ function load_data(dataset::String, ratios=(0.6,0.2,0.2); seed=nothing,
         data_normal, data_anomalous = load_lhco(dataset; kwargs...)
     elseif dataset in mvtec_datasets
         data_normal, data_anomalous = load_mvtec(dataset; kwargs...)
+    elseif dataset == "modelnet"
+        # ModelNet already comes as finished (train, val, test) splits
+        return load_modelnet(;kwargs...)
     else
         data_normal, data_anomalous = load_mill_data(dataset; kwargs...)
     end
diff --git a/src/experimental_loops.jl b/src/experimental_loops.jl
index eafa881..05179ed 100755
--- a/src/experimental_loops.jl
+++ b/src/experimental_loops.jl
@@ -174,13 +174,19 @@
             data = GroupAD.leave_one_in(data; seed=seed)
         elseif method == "leave-one-out"
             data = GroupAD.leave_one_out(data; seed=seed)
+        elseif dataset == "modelnet"
+            # ModelNet splits come ready-made from load_modelnet, no undersampling needed
+            nothing
         else
            error("This model can only run on point cloud datasets!")
        end

        # define where data is going to be saved
        # _savepath = joinpath(savepath, "$(modelname)/$(dataset)/$(method)/class_index=$(class)/seed=$(seed)")
-        _savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
+        if dataset == "modelnet"
+            # ModelNet has no anomaly-class index, the normal class is encoded in `method`
+            _savepath = joinpath(savepath, "$(modelname)/$(method)/seed=$(seed)")
+        else
+            _savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
+        end
         mkpath(_savepath)

         # edit parameters
@@ -236,90 +242,6 @@
     (try_counter == max_tries) ? (@info "Reached $(max_tries) tries, giving up.") : nothing
 end
 
-"""
-    point_cloud_experimental_loop(sample_params_f, fit_f, edit_params_f,
-        max_seed, modelname, dataset, contamination, savepath, anomaly_classes, method)
-
-This function takes a function that samples parameters, a fit function and a function that edits the sampled
-parameters and other parameters. Then it loads data, samples hyperparameters, calls the fit function
-that is supposed to construct and fit a model and finally evaluates the returned score functions on
-the loaded data.
-
-This function works for point cloud datasets. Differentiation between leave-one-in and leave-one-out
-setting is done via parameter `method`.
-
-This particular loop only loops over 1:max_seed with given anomaly class.
-""" -function point_cloud_experimental_loop2(sample_params_f, fit_f, edit_params_f, - max_seed, modelname, dataset, contamination, savepath, anomaly_class, method) - # sample the random hyperparameters - parameters = sample_params_f() - - # for the given number of seeds run the experiment - for seed in 1:max_seed - # with these hyperparameters, train and evaluate the model on different train/val/tst splits - # load data for either "MNIST_in" or "MNIST_out" and set the setting - # prepared for other point cloud datasets such as ModelNet10 - - # load data - data = load_data(dataset, anomaly_class_ind=anomaly_class, seed=seed, method=method, contamination=contamination) - - # undersample data for leave-one-in and leave-one-out methods - if method == "leave-one-in" - data = GroupAD.leave_one_in(data; seed=seed) - elseif method == "leave-one-out" - data = GroupAD.leave_one_out(data; seed=seed) - else - error("This model can only run on point cloud datasets!") - end - - # define where data is going to be saved - _savepath = joinpath(savepath, "$(modelname)/$(dataset)/$(method)/class_index=$(class)/seed=$(seed)") - mkpath(_savepath) - - # edit parameters - edited_parameters = edit_params_f(data, parameters, anomaly_class, method) - - @info "Trying to fit $modelname on $(dataset) in $method setting.\nModel parameters: $(edited_parameters)..." - @info "Train/validation/test splits: $(size(data[1][1], 2)) | $(size(data[2][1], 2)) | $(size(data[3][1], 2))" - @info "Number of features: $(size(data[1][1], 1))" - - @info "Params check done. Trying to fit." - # fit - training_info, results = fit_f(data, edited_parameters) - - # save the model separately - if training_info.model != nothing - modelf = joinpath(_savepath, savename("model", edited_parameters, "bson", digits=5)) - tagsave( - modelf, - Dict("model"=>training_info.model, - "fit_t"=>training_info.fit_t, - "history"=>training_info.history, - "parameters"=>edited_parameters - ), - safe = true) - (@info "Model saved to $modelf") - - training_info = merge(training_info, (model = nothing, history=nothing)) - end - - # here define what additional info should be saved together with parameters, scores, labels and predict times - save_entries = merge(training_info, (modelname = modelname, seed = seed, dataset = dataset)) - - # now loop over all anomaly score funs - @time for result in results - if modelname in ["vae_instance", "statistician", "PoolModel"] - @info "Trying to save results..." - experiment_bag(result..., data, _savepath; save_entries...) - else - experiment(result..., data, _savepath; save_entries...) - end - end - end -end - - """ toy_experimental_loop_toy(sample_params_f, fit_f, edit_params_f, max_seed, modelname, dataset, contamination, savepath) diff --git a/src/modelnet.jl b/src/modelnet.jl new file mode 100644 index 0000000..4145182 --- /dev/null +++ b/src/modelnet.jl @@ -0,0 +1,84 @@ +using HDF5 +using Random +using StatsBase +using Mill + +function tensors_to_mill(data) + n = size(data, 2) + lengths = repeat([n], size(data, 3)) + idxes = Mill.length2bags(lengths) + flatten = reshape(data, 3, n * size(data, 3)) + return BagNode(ArrayNode(flatten), idxes) +end + +function load_modelnet(npoints=2048; method="chair", validation::Bool=true, ratio=0.2, seed::Int=666, kwargs...) 
+function load_modelnet(npoints=2048; method="chair", validation::Bool=true, ratio=0.2, seed::Int=666, kwargs...)
+    # `method` carries the name of the normal class here (e.g. "chair"), so the
+    # generic experimental loops can pass it through without further changes
+    (X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = _load_modelnet10(npoints, method; validation=validation, ratio=ratio, seed=seed)
+    X_train = tensors_to_mill(X_train)
+    X_val = tensors_to_mill(X_val)
+    X_test = tensors_to_mill(X_test)
+    return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)
+end
+
+# utility: split bags (indexed by the third dimension) into train/validation parts
+function train_test_split(X, y, ratio=0.2; seed=nothing)
+    isnothing(seed) || Random.seed!(seed)
+
+    N = size(X, 3)
+    idx_samples = sample(1:N, floor(Int, N*ratio), replace=false)
+    idx_bool = zeros(Bool, N)
+    idx_bool[idx_samples] .= true
+
+    X_val = X[:, :, idx_bool]
+    Y_val = y[idx_bool]
+    X_train = X[:, :, .!idx_bool]
+    Y_train = y[.!idx_bool]
+
+    # reseed the global RNG so the fixed seed does not leak into later code
+    isnothing(seed) || Random.seed!()
+    return (X_train, Y_train), (X_val, Y_val)
+end
+
+"""
+    _load_modelnet10(npoints=2048, type="all"; validation=true, ratio=0.2, seed=666)
+
+- `npoints`: number of points per object (512 / 1024 / 2048)
+- `type`: name of the normal class, e.g. "chair" or "monitor"
+- `validation`: kept for API compatibility, a validation split is always created
+- `ratio`: fraction of the training data used for the validation split
+- `seed`: random seed for the validation split
+"""
+function _load_modelnet10(npoints=2048, type="all"; validation::Bool=true, ratio=0.2, seed::Int=666)
+    # load data (the file is expected in the project data folder)
+    file = joinpath(dirname(@__DIR__), "data", "modelnet10_$(npoints).h5")
+    data = HDF5.h5open(file, "r")
+    X_train, X_test, Y_train, Y_test = data["X_train"]|>read, data["X_test"]|>read, data["Y_train"]|>read, data["Y_test"]|>read
+    close(data)
+
+    titles = ["bathtub", "bed", "chair", "desk", "dresser", "monitor", "night_stand", "sofa", "table", "toilet"]
+    type in titles || error("Unknown ModelNet10 class \"$(type)\".")
+
+    (X_train, Y_train), (X_val, Y_val) = train_test_split(X_train, Y_train, ratio, seed=seed)
+    # keep only the normal class in the training set, mark all other classes as anomalous
+    idx = findfirst(==(type), titles)
+    X_train = X_train[:, :, Y_train .== idx]
+    Y_train = falses(size(X_train, 3))
+    Y_val = Y_val .!= idx
+    Y_test = Y_test .!= idx
+    return ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
+end
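+
+# Quick sanity check, a sketch assuming the HDF5 file above is present:
+#
+#   (tr_x, tr_y), (val_x, val_y), (tst_x, tst_y) = load_modelnet(2048; method="chair")
+#   tr_x isa BagNode        # one bag of 2048 instances per object
+#   any(tr_y), any(val_y)   # (false, true): anomalies appear only outside the training set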
""" -struct PoolModel{pre <: Chain, post <: Chain, fun <: Function, p <: ContinuousMultivariateDistribution, e <: ConditionalMvNormal, g <: ConditionalMvNormal, d <: Chain} +struct PoolModel{pre <: Chain, post <: Chain, fun <: Function, g <: ConditionalMvNormal, d <: Chain} prepool_net::pre postpool_net::post poolf::fun - prior::p - encoder::e generator::g decoder::d end @@ -36,15 +34,19 @@ end Flux.@functor PoolModel function Flux.trainable(m::PoolModel) - (prepool_net = m.prepool_net, postpool_net = m.postpool_net, encoder = m.encoder, generator = m.generator, decoder = m.decoder) + (prepool_net = m.prepool_net, postpool_net = m.postpool_net, generator = m.generator, decoder = m.decoder) end -function PoolModel(pre, post, fun, gen, dec, enc::ConditionalMvNormal, plength::Int) - W = first(Flux.params(enc)) - μ = fill!(similar(W, plength), 0) - σ = fill!(similar(W, plength), 1) - prior = DistributionsAD.TuringMvNormal(μ, σ) - PoolModel(pre, post, fun, prior, enc, gen, dec) +# function PoolModel(pre, post, fun, gen, dec) +# PoolModel(pre, post, fun, gen, dec) +# end + +function (m::PoolModel)(x) + v = m.prepool_net(x) + p = m.poolf(v) + p_post = m.postpool_net(p) + z = hcat([rand(m.generator, p_post) for i in 1:size(x, 2)]...) + m.decoder(z) end function Base.show(io::IO, pm::PoolModel) @@ -88,25 +90,12 @@ function pm_constructor(;idim, hdim, predim, postdim, edim, activation="swish", ) if var == "scalar" - # encoder - enc = Chain( - build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)..., - SplitLayer(hdim,[edim,1]) - ) - enc_dist = ConditionalMvNormal(enc) - gen = Chain( build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)..., SplitLayer(hdim,[edim,1]) ) gen_dist = ConditionalMvNormal(gen) else - enc = Chain( - build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)..., - SplitLayer(hdim,[edim,edim]) - ) - enc_dist = ConditionalMvNormal(enc) - gen = Chain( build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)..., SplitLayer(hdim,[edim,edim]) @@ -119,7 +108,7 @@ function pm_constructor(;idim, hdim, predim, postdim, edim, activation="swish", Dense(hdim,idim) ) - pm = PoolModel(pre, post, fun, gen_dist, dec, enc_dist, edim) + pm = PoolModel(pre, post, fun, gen_dist, dec) return pm end @@ -229,7 +218,8 @@ function StatsBase.fit!(model::PoolModel, data::Tuple, loss::Function; for batch in RandomBatches(tr_x, 10) # classic training bag_batch = RandomBagBatches(tr_x,batchsize=batchsize,randomize=true) - Flux.train!(lossf, ps, [bag_batch], opt) + # Flux.train!(lossf, ps, [bag_batch], opt) + Flux.train!(lossf, ps, bag_batch, opt) # only batch training loss batch_loss = lossf(bag_batch) # mean(lossf.(bag_batch)) diff --git a/src/models/hmil.jl b/src/models/hmil.jl index 83f5fa1..5e619c2 100644 --- a/src/models/hmil.jl +++ b/src/models/hmil.jl @@ -159,10 +159,6 @@ function fit_hmil!(model::Chain, data::Tuple, loss::Function; ps = Flux.params(tr_model) _patience = patience - best_val_loss = Inf - i = 1 - start_time = time() - # prepare data tr, vl = classification_data(data, na, seed) train_data, train_labels = tr[1], Flux.onehotbatch(tr[2], [0,1]) @@ -170,6 +166,10 @@ function fit_hmil!(model::Chain, data::Tuple, loss::Function; lossf(x, y) = loss(tr_model, x, y) + best_val_loss = Inf + i = 1 + start_time = time() + # infinite for loop via RandomBatches for batch in RandomBatches(train_data, 1) # classic training @@ -185,8 +185,13 @@ function fit_hmil!(model::Chain, data::Tuple, loss::Function; # validation/early stopping # val_loss = lossf(bag_batch) # 
 
 function Base.show(io::IO, pm::PoolModel)
@@ -88,25 +90,12 @@ function pm_constructor(;idim, hdim, predim, postdim, edim, activation="swish",
     )
 
     if var == "scalar"
-        # encoder
-        enc = Chain(
-            build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
-            SplitLayer(hdim,[edim,1])
-        )
-        enc_dist = ConditionalMvNormal(enc)
-
         gen = Chain(
             build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
             SplitLayer(hdim,[edim,1])
         )
         gen_dist = ConditionalMvNormal(gen)
     else
-        enc = Chain(
-            build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
-            SplitLayer(hdim,[edim,edim])
-        )
-        enc_dist = ConditionalMvNormal(enc)
-
         gen = Chain(
             build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
             SplitLayer(hdim,[edim,edim])
@@ -119,7 +108,7 @@
         Dense(hdim,idim)
     )
 
-    pm = PoolModel(pre, post, fun, gen_dist, dec, enc_dist, edim)
+    pm = PoolModel(pre, post, fun, gen_dist, dec)
     return pm
 end
 
@@ -229,7 +218,8 @@ function StatsBase.fit!(model::PoolModel, data::Tuple, loss::Function;
     for batch in RandomBatches(tr_x, 10)
         # classic training
         bag_batch = RandomBagBatches(tr_x,batchsize=batchsize,randomize=true)
-        Flux.train!(lossf, ps, [bag_batch], opt)
+        # step through the sampled bags (the old call trained on the whole batch at once)
+        Flux.train!(lossf, ps, bag_batch, opt)
         # only batch training loss
         batch_loss = lossf(bag_batch)
         # mean(lossf.(bag_batch))
diff --git a/src/models/hmil.jl b/src/models/hmil.jl
index 83f5fa1..5e619c2 100644
--- a/src/models/hmil.jl
+++ b/src/models/hmil.jl
@@ -159,10 +159,6 @@ function fit_hmil!(model::Chain, data::Tuple, loss::Function;
     ps = Flux.params(tr_model)
     _patience = patience
 
-    best_val_loss = Inf
-    i = 1
-    start_time = time()
-
     # prepare data
     tr, vl = classification_data(data, na, seed)
     train_data, train_labels = tr[1], Flux.onehotbatch(tr[2], [0,1])
@@ -170,6 +166,10 @@
 
     lossf(x, y) = loss(tr_model, x, y)
 
+    best_val_loss = Inf
+    i = 1
+    start_time = time()
+
     # infinite for loop via RandomBatches
     for batch in RandomBatches(train_data, 1)
         # classic training
@@ -185,8 +185,13 @@
         # validation/early stopping
         # val_loss = lossf(bag_batch) # mean(lossf.(val_x))
         val_loss = lossf(val_data, val_labels)
+        train_loss = lossf(train_data, train_labels)
+
+        acc = mean(Flux.onecold(tr_model(train_data), [0,1]) .== Flux.onecold(train_labels, [0,1]))
 
         @info "$i - loss: $(batch_loss) (batch) | $(val_loss) (validation)"
+        @info "train loss: $train_loss"
+        @info "train accuracy: $acc"
 
         if isnan(val_loss) || isnan(batch_loss)
             error("Encountered invalid values in loss function.")
@@ -227,7 +232,7 @@ function hmil_basic_loop(sample_params_f, fit_f, edit_params_f,
     # sample the random hyperparameters
     parameters = sample_params_f()
 
-    for na in [100,10,20]
+    for na in [100,5,10,20]
     # for na in 0
         # with these hyperparameters, train and evaluate the model on different train/val/tst splits
         for seed in 1:max_seed
@@ -282,11 +287,11 @@ function hmil_pc_loop(sample_params_f, fit_f, edit_params_f,
     # sample the random hyperparameters
     parameters = sample_params_f()
 
-    for na in [100,10,20]
+    Threads.@threads for na in [10, 20, 100]
     # run over all classes with the same hyperparameters
     # use more CPU cores for calculation
     @info "Starting parallel process on $(Threads.nthreads()) cores (over $max_seed seeds)."
-    Threads.@threads for seed in 1:max_seed
+    for seed in 1:max_seed
        # with these hyperparameters, train and evaluate the model on different train/val/tst splits
        # load data for either "MNIST_in" or "MNIST_out" and set the setting
        # prepared for other point cloud datasets such as ModelNet10
@@ -296,13 +301,24 @@
            data = GroupAD.leave_one_in(data; seed=seed)
        elseif method == "leave-one-out"
            data = GroupAD.leave_one_out(data; seed=seed)
+       elseif dataset == "modelnet"
+           # labels from load_modelnet already mark anomalies as `true`,
+           # so no flipping or resampling is needed here
+           nothing
        else
            error("This model can only run on point cloud datasets!")
        end
 
        # define where data is going to be saved
        # _savepath = joinpath(savepath, "$(modelname)/$(dataset)/$(method)/class_index=$(class)/seed=$(seed)")
-       _savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
+       if dataset == "modelnet"
+           _savepath = joinpath(savepath, "$(modelname)/$(method)/seed=$(seed)")
+       else
+           _savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
+       end
        mkpath(_savepath)
 
        # edit parameters
@@ -349,4 +365,4 @@ end
 
 function get_label_hmil(model, x)
     Flux.onecold(model(x), [0,1])
-end
\ No newline at end of file
+end
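+
+# Usage sketch (hypothetical trained classifier `model` and validation tuple `vl = (bags, labels)`):
+#
+#   y_pred = get_label_hmil(model, vl[1])   # hard 0/1 labels via onecold
+#   val_acc = mean(y_pred .== vl[2])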