From d0d96aa769e348f06d11d8df344619f8b8710cc6 Mon Sep 17 00:00:00 2001 From: maskomic Date: Tue, 4 Oct 2022 10:48:51 +0200 Subject: [PATCH] mvtec --- Manifest.toml | 12 ++ Project.toml | 1 + README.md | 4 +- experimental/lhco_results.jl | 46 ++++++ scripts/experiments_mill/datasets_mvtech.txt | 4 + scripts/experiments_mill/knn_basic.jl | 33 +++-- scripts/experiments_mill/poolmodel.jl | 33 +++-- scripts/experiments_mill/statistician.jl | 33 +++-- scripts/experiments_mill/vae_basic.jl | 31 +++-- scripts/experiments_mill/vae_instance.jl | 31 +++-- scripts/experiments_mvtec/vae_basic.jl | 139 +++++++++++++++++++ src/GroupAD.jl | 13 +- src/data.jl | 36 +++++ 13 files changed, 365 insertions(+), 51 deletions(-) create mode 100644 scripts/experiments_mill/datasets_mvtech.txt create mode 100644 scripts/experiments_mvtec/vae_basic.jl diff --git a/Manifest.toml b/Manifest.toml index 3a730d0..2a51b4a 100755 --- a/Manifest.toml +++ b/Manifest.toml @@ -503,6 +503,18 @@ git-tree-sha1 = "53bb909d1151e57e2484c3d1b53e19552b887fb2" uuid = "42e2da0e-8278-4e71-bc24-59509adca0fe" version = "1.0.2" +[[deps.HDF5]] +deps = ["Compat", "HDF5_jll", "Libdl", "Mmap", "Random", "Requires"] +git-tree-sha1 = "899f041bf330ebeead3637073b2ca7477760edde" +uuid = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" +version = "0.16.11" + +[[deps.HDF5_jll]] +deps = ["Artifacts", "JLLWrappers", "LibCURL_jll", "Libdl", "OpenSSL_jll", "Pkg", "Zlib_jll"] +git-tree-sha1 = "c003b31e2e818bc512b0ff99d7dce03b0c1359f5" +uuid = "0234f1f7-429e-5d53-9886-15a909be8d59" +version = "1.12.2+1" + [[deps.HTTP]] deps = ["Base64", "Dates", "IniFile", "Logging", "MbedTLS", "NetworkOptions", "Sockets", "URIs"] git-tree-sha1 = "0fa77022fe4b511826b39c894c90daf5fce3334a" diff --git a/Project.toml b/Project.toml index 46f07e1..3612706 100755 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ EvalMetrics = "251d5f9e-10c1-4699-ba24-e0ad168fa3e4" FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" 
Flux3D = "432009dd-59a1-4b72-8c93-6462ce9b220f" +HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" LIBSVM = "b1bec4e5-fd48-53fe-b0cb-9723c09d164b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MLDataPattern = "9920b226-0b2a-5f5f-9153-9aa70a013f8b" diff --git a/README.md b/README.md index bfb4182..025896a 100644 --- a/README.md +++ b/README.md @@ -28,10 +28,12 @@ julia GroupAD.jl/scripts/evaluate_performance_single.jl path/to/results ## Running experiments on the RCI cluster +*Note: Since the LHCO dataset was added, Python is needed for data loading. Use the Python/3.8 module to install `pandas`.* + 0. First, load Julia and Python modules. ```bash ml Julia -ml Python +ml Python/3.8 ``` 1. Install the package somewhere on the RCI cluster. 2. Then the experiments can be run via `slurm`. This will run 20 experiments with the basic VAE model, each with 5 crossvalidation repetitions on all datasets in the text file with 10 parallel processes for each dataset. diff --git a/experimental/lhco_results.jl b/experimental/lhco_results.jl index 0b8240d..507b2ae 100644 --- a/experimental/lhco_results.jl +++ b/experimental/lhco_results.jl @@ -126,6 +126,16 @@ function collect_lhco(model::String, dataset="events_anomalydetection_v2.h5") return vcat(dfs...) end +function collect_mvtec(model::String, datasets=mvtec_datasets) + len = length(mvtec_datasets) + dfs = repeat([DataFrame()], len) + Threads.@threads for i in 1:len + _df = collect_results(datadir("experiments", "contamination-0.0", "mv_tec", model, datasets[i]), subfolders=true, rexclude=[r"model_.*"]) + dfs[i] = _df + end + return vcat(dfs...) +end + """ calculate_results(model::String; dataset::String="MIL", metric::Symbol=:val_AUC, show=false, tf=tf_unicode, filter_fun=nothing, max_seed=10) @@ -140,6 +150,8 @@ function calculate_results(model::String; dataset::String="MIL", metric::Symbol= df = collect_mill(model) elseif dataset == "LHCO" df = collect_lhco(model) + elseif dataset == "mvtec" + df = collect_mvtec(model) end @info "Data loaded." 
# filter out model files (for vae, statistician...) - not needed with the newest DrWatson's collect_results rexclude @@ -223,3 +235,37 @@ function lhco_model_results(model::String; metric::Symbol=:val_AUC, show=false, end R2, g2[1] end + +function mvtec_model_results(model::String; metric::Symbol=:val_AUC, show=false, tf=tf_unicode, filter_fun=nothing, max_seed=5) + # load results and create a grouped dataframe + g2 = calculate_results(model, dataset="mvtec", metric=metric, show=show, tf=tf, filter_fun=filter_fun, max_seed=max_seed) + # find the best model based on metric (validation AUC) + R = findmaxs(g2, metric) + + # reorder columns + c = ncol(R) + R2 = R[:, vcat([1,c-1,c], setdiff(1:c, [1,c,c-1]))] + + # create a pretty table + if show + pretty_table(R2, nosubheader=true, tf = tf) + end + + return R2, g2 +end + +function results_all_models(dataset::String; models = ["knn_basic", "vae_basic", "vae_instance", "statistician", "PoolModel"], + metric::Symbol=:val_AUC, show=false, tf=tf_unicode, filter_fun=nothing, max_seed=5) + PT = [] + + for model in models + g = calculate_results(model, dataset=dataset, metric=metric, show=show, tf=tf, filter_fun=filter_fun, max_seed=max_seed) + R = findmaxs(g, metric) + c = ncol(R) + R2 = R[:, vcat([1,c-1,c], setdiff(1:c, [1,c,c-1]))] + p = hcat(DataFrame(:modelname => model), R2) + push!(PT, p) + end + + map(x -> pretty_table(x, nosubheader=true, tf=tf), PT) +end \ No newline at end of file diff --git a/scripts/experiments_mill/datasets_mvtech.txt b/scripts/experiments_mill/datasets_mvtech.txt new file mode 100644 index 0000000..47c326d --- /dev/null +++ b/scripts/experiments_mill/datasets_mvtech.txt @@ -0,0 +1,4 @@ +hazelnut_together +pill_together +screw_together +toothbrush_together diff --git a/scripts/experiments_mill/knn_basic.jl b/scripts/experiments_mill/knn_basic.jl index 5aab5b8..4a0523f 100644 --- a/scripts/experiments_mill/knn_basic.jl +++ b/scripts/experiments_mill/knn_basic.jl @@ -95,14 +95,27 @@ edit_params = 
GroupAD.edit_params #################################################################### ################ THIS PART IS COMMON FOR ALL MODELS ################ if abspath(PROGRAM_FILE) == @__FILE__ - GroupAD.basic_experimental_loop( - sample_params, - fit, - edit_params, - max_seed, - modelname, - dataset, - contamination, - datadir("experiments/contamination-$(contamination)/MIL") + if in(dataset, mill_datasets) + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/MIL"), ) -end + elseif in(dataset, mvtec_datasets) + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/mv_tec") + ) + end +end \ No newline at end of file diff --git a/scripts/experiments_mill/poolmodel.jl b/scripts/experiments_mill/poolmodel.jl index 7129477..6302714 100644 --- a/scripts/experiments_mill/poolmodel.jl +++ b/scripts/experiments_mill/poolmodel.jl @@ -125,14 +125,27 @@ end ################ THIS PART IS COMMON FOR ALL MODELS ################ # only execute this if run directly - so it can be included in other files if abspath(PROGRAM_FILE) == @__FILE__ - GroupAD.basic_experimental_loop( - sample_params, - fit, - edit_params, - max_seed, - modelname, - dataset, - contamination, - datadir("experiments/contamination-$(contamination)/MIL") + if in(dataset, mill_datasets) + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/MIL"), ) -end + elseif in(dataset, mvtec_datasets) + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/mv_tec") + ) + end +end \ No newline at end of file diff --git 
a/scripts/experiments_mill/statistician.jl b/scripts/experiments_mill/statistician.jl index 4ad1374..edfa4f7 100644 --- a/scripts/experiments_mill/statistician.jl +++ b/scripts/experiments_mill/statistician.jl @@ -123,14 +123,27 @@ end ################ THIS PART IS COMMON FOR ALL MODELS ################ # only execute this if run directly - so it can be included in other files if abspath(PROGRAM_FILE) == @__FILE__ - GroupAD.basic_experimental_loop( - sample_params, - fit, - edit_params, - max_seed, - modelname, - dataset, - contamination, - datadir("experiments/contamination-$(contamination)/MIL") + if in(dataset, mill_datasets) + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/MIL"), ) -end + elseif in(dataset, mvtec_datasets) + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/mv_tec") + ) + end +end \ No newline at end of file diff --git a/scripts/experiments_mill/vae_basic.jl b/scripts/experiments_mill/vae_basic.jl index 8003e3e..b847f21 100644 --- a/scripts/experiments_mill/vae_basic.jl +++ b/scripts/experiments_mill/vae_basic.jl @@ -126,14 +126,27 @@ end ################ THIS PART IS COMMON FOR ALL MODELS ################ # only execute this if run directly - so it can be included in other files if abspath(PROGRAM_FILE) == @__FILE__ - GroupAD.basic_experimental_loop( - sample_params, - fit, - edit_params, - max_seed, - modelname, - dataset, - contamination, - datadir("experiments/contamination-$(contamination)/MIL") + if in(dataset, mill_datasets) + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/MIL") ) + elseif in(dataset, mvtec_datasets) + GroupAD.basic_experimental_loop( 
+ sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/mv_tec") + ) + end end diff --git a/scripts/experiments_mill/vae_instance.jl b/scripts/experiments_mill/vae_instance.jl index 35340a2..69c64c6 100644 --- a/scripts/experiments_mill/vae_instance.jl +++ b/scripts/experiments_mill/vae_instance.jl @@ -139,14 +139,27 @@ end ################ THIS PART IS COMMON FOR ALL MODELS ################ # only execute this if run directly - so it can be included in other files if abspath(PROGRAM_FILE) == @__FILE__ - GroupAD.basic_experimental_loop( - sample_params, - fit, - edit_params, - max_seed, - modelname, - dataset, - contamination, - datadir("experiments/contamination-$(contamination)/MIL"), + if in(dataset, mill_datasets) + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/MIL"), ) + elseif in(dataset, mvtec_datasets) + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/mv_tec") + ) + end end diff --git a/scripts/experiments_mvtec/vae_basic.jl b/scripts/experiments_mvtec/vae_basic.jl new file mode 100644 index 0000000..8003e3e --- /dev/null +++ b/scripts/experiments_mvtec/vae_basic.jl @@ -0,0 +1,139 @@ +using DrWatson +@quickactivate +using ArgParse +using GroupAD +import StatsBase: fit!, predict +using StatsBase +using BSON +using Flux +using GroupAD.GenerativeModels + +s = ArgParseSettings() +@add_arg_table! 
s begin + "max_seed" + arg_type = Int + help = "seed" + default = 1 + "dataset" + default = "Fox" + arg_type = String + help = "dataset" + "contamination" + default = 0.0 + arg_type = Float64 + help = "training data contamination rate" +end +parsed_args = parse_args(ARGS, s) +@unpack dataset, max_seed, contamination = parsed_args + +####################################################################################### +################ THIS PART IS TO BE PROVIDED FOR EACH MODEL SEPARATELY ################ +modelname = "vae_basic" +# sample parameters, should return a Dict of model kwargs +""" + sample_params() + +Should return a named tuple that contains a sample of model parameters. +""" +function sample_params() + par_vec = (2 .^(3:8), 2 .^(4:9), ["scalar", "diagonal"], 10f0 .^(-4:-3), 2 .^ (5:7), ["relu", "swish", "tanh"], 3:4, 1:Int(1e8), + ["mean", "maximum", "median"]) + argnames = (:zdim, :hdim, :var, :lr, :batchsize, :activation, :nlayers, :init_seed, :aggregation) + parameters = (;zip(argnames, map(x->sample(x, 1)[1], par_vec))...) + # ensure that zdim < hdim + while parameters.zdim >= parameters.hdim + parameters = merge(parameters, (zdim = sample(par_vec[1])[1],)) + end + return parameters +end + +""" + loss(model::GenerativeModels.VAE, x[, batchsize]) + +Negative ELBO for training of a VAE model. +""" +loss(model::GenerativeModels.VAE, x) = -elbo(model, x) +# version of loss for large datasets +loss(model::GenerativeModels.VAE, x, batchsize::Int) = + mean(map(y->loss(model,y), Flux.Data.DataLoader(x, batchsize=batchsize))) + +""" + fit(data, parameters) + +This is the most important function - returns `training_info` and a tuple or a vector of tuples `(score_fun, final_parameters)`. +`training_info` contains additional information on the training process that should be saved, the same for all anomaly score functions. +Each element of the return vector contains a specific anomaly score function - there can be multiple for each trained model. 
+Final parameters is a named tuple of names and parameter values that are used for creation of the savefile name. +""" +function fit(data, parameters) + # construct model - constructor should only accept kwargs + model = GroupAD.Models.vae_constructor(;idim=size(data[1][1],1), parameters...) + + # aggregate bags into vectors + # first convert the aggregation string to a function + agf = getfield(StatsBase, Symbol(parameters.aggregation)) + data = GroupAD.Models.aggregate(data, agf) + + # fit train data + try + global info, fit_t, _, _, _ = @timed fit!(model, data, loss; max_train_time=82800/max_seed, + patience=200, check_interval=10, parameters...) + catch e + # return an empty array if fit fails so nothing is computed + @info "Failed training due to \n$e" + return (fit_t = NaN, history=nothing, npars=nothing, model=nothing), [] + end + + # construct return information - put e.g. the model structure here for generative models + training_info = ( + fit_t = fit_t, + history = info.history, + npars = info.npars, + model = info.model + ) + + # now return the info to be saved and an array of tuples (anomaly score function, hyperparameters) + L=100 + batchsize=512 + training_info, [ + (x -> GroupAD.Models.reconstruction_score(info.model,x,agf), + merge(parameters, (score = "reconstruction", L=1))), + (x -> GroupAD.Models.reconstruction_score_mean(info.model,x,agf), + merge(parameters, (score = "reconstruction-mean", L=1))), + (x -> GroupAD.Models.reconstruction_score(info.model,x,agf,L), + merge(parameters, (score = "reconstruction-sampled", L=L))) + ] +end + +""" + edit_params(data, parameters) + +This function edits the sampled parameters based on the nature of the data - e.g. dimensions etc. Default 
+""" +function edit_params(data, parameters) + idim = size(data[1][1].data.data,1) + # put the largest possible zdim where zdim < idim, the model tends to converge poorly if the latent dim is larger than idim + if parameters.zdim >= idim + zdims = 2 .^(1:8) + zdim_new = zdims[zdims .< idim][end] + parameters = merge(parameters, (zdim=zdim_new,)) + end + parameters +end + +#################################################################### +################ THIS PART IS COMMON FOR ALL MODELS ################ +# only execute this if run directly - so it can be included in other files +if abspath(PROGRAM_FILE) == @__FILE__ + GroupAD.basic_experimental_loop( + sample_params, + fit, + edit_params, + max_seed, + modelname, + dataset, + contamination, + datadir("experiments/contamination-$(contamination)/MIL") + ) +end diff --git a/src/GroupAD.jl b/src/GroupAD.jl index 979248a..eb69f30 100755 --- a/src/GroupAD.jl +++ b/src/GroupAD.jl @@ -11,6 +11,7 @@ using BSON using DataDeps using Mmap using Distributions +using HDF5 if occursin("Python/3.8.6-GCCcore-10.2.0", read(`which python`, String)) using PyCall end @@ -25,7 +26,7 @@ include("experimental_loops.jl") include("ipmeasures/IPMeasures.jl") include("generative_models/GenerativeModels.jl") include("models/Models.jl") -#include("evaluation/Evaluation.jl") +include("evaluation/Evaluation.jl") const mill_datasets = [ "BrownCreeper", "CorelAfrican", "CorelBeach", "Elephant", "Fox", "Musk1", "Musk2", @@ -33,6 +34,14 @@ const mill_datasets = [ "Tiger", "UCSBBreastCancer", "Web1", "Web2", "Web3", "Web4", "WinterWren" ] -export mill_datasets +const mvtec_datasets = [ + "capsule_together", + "hazelnut_together", + "pill_together", + "screw_together", + "toothbrush_together" +] + +export mill_datasets, mvtec_datasets end #module diff --git a/src/data.jl b/src/data.jl index 250080e..3e021e3 100644 --- a/src/data.jl +++ b/src/data.jl @@ -324,6 +324,40 @@ function load_ember(;normalize=true) X_train, y_train, X_tst, y_tst end 
+function load_sift(filename::String="capsule_together") + file = h5open(datadir("sift_mvtec", "$filename.h5")) + data = file["data"][:,:] + labels = file["labels"][:] + bagids = file["sizes"][:] + return data, labels, bagids +end + +function load_mvtec(dataset::String="capsule_together") + data, labels, bagids = load_sift() + idxes = Mill.length2bags([sum(bagids .== c) for c in sort(unique(bagids))]) + bags = Mill.BagNode(Mill.ArrayNode(data), idxes) + + obs0 = labels .== 0 + obs1 = labels .== 1 + + bagids0 = idxes[obs0] + bagids1 = idxes[obs1] + + dataix0 = vcat([collect(x) for x in bagids0]...) + dataix1 = vcat([collect(x) for x in bagids1]...) + f = [first(b) for b in bagids1] + l = [last(b) for b in bagids1] + l = l .- f[1] .+ 1 + f = f .- f[1] .+ 1 + + newbagids = [fi:li for (fi, li) in zip(f, l)] + + return ( + normal = BagNode(ArrayNode(hcat([data[:, b] for b in bagids0]...)), bagids0), + anomaly = BagNode(ArrayNode(hcat([data[:, b] for b in bagids1]...)), newbagids) + ) +end + """ seqids2bags(bagids) @@ -530,6 +564,8 @@ function load_data(dataset::String, ratios=(0.6,0.2,0.2); seed=nothing, method = data_normal, data_anomalous, _, _ = load_mnist_point_cloud(;kwargs...) elseif occursin("events", dataset) data_normal, data_anomalous = load_lhco(dataset; kwargs...) + elseif dataset in mvtec_datasets + data_normal, data_anomalous = load_mvtec(dataset; kwargs...) else data_normal, data_anomalous = load_mill_data(dataset; kwargs...) end