add modelnet to data, NS cleanup
maskomic committed Oct 10, 2023
1 parent 37a0ae2 commit 3af5e9c
Showing 6 changed files with 134 additions and 119 deletions.
1 change: 1 addition & 0 deletions src/GroupAD.jl
@@ -19,6 +19,7 @@ using HDF5
export GenerativeModels

include("data.jl")
include("modelnet.jl")
include("toy.jl")
include("experiments.jl")
include("experimental_loops.jl")
2 changes: 2 additions & 0 deletions src/data.jl
@@ -543,6 +543,8 @@ function load_data(dataset::String, ratios=(0.6,0.2,0.2); seed=nothing, method =
data_normal, data_anomalous = load_lhco(dataset; kwargs...)
elseif dataset in mvtec_datasets
data_normal, data_anomalous = load_mvtec(dataset; kwargs...)
elseif dataset == "modelnet"
return load_modelnet(;kwargs...)
else
data_normal, data_anomalous = load_mill_data(dataset; kwargs...)
end
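For orientation, a minimal sketch of how the new branch is reached (values are illustrative; whether options such as `seed` actually reach `load_modelnet` depends on which keywords `load_data` captures into `kwargs`):

    # "modelnet" now short-circuits to the dedicated loader and returns its splits directly
    train, val, test = load_data("modelnet")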
92 changes: 7 additions & 85 deletions src/experimental_loops.jl
@@ -174,13 +174,19 @@ function point_cloud_experimental_loop(sample_params_f, fit_f, edit_params_f,
data = GroupAD.leave_one_in(data; seed=seed)
elseif method == "leave-one-out"
data = GroupAD.leave_one_out(data; seed=seed)
elseif dataset == "modelnet"
nothing
else
error("This model can only run on point cloud datasets!")
end

# define where data is going to be saved
# _savepath = joinpath(savepath, "$(modelname)/$(dataset)/$(method)/class_index=$(class)/seed=$(seed)")
_savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
if dataset == "modelnet"
_savepath = joinpath(savepath, "$(modelname)/$(method)/seed=$(seed)")
else
_savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
end
mkpath(_savepath)

# edit parameters
@@ -236,90 +242,6 @@
(try_counter == max_tries) ? (@info "Reached $(max_tries) tries, giving up.") : nothing
end

"""
point_cloud_experimental_loop(sample_params_f, fit_f, edit_params_f,
max_seed, modelname, dataset, contamination, savepath, anomaly_classes, method)
This function takes a function that samples parameters, a fit function and a function that edits the sampled
parameters and other parameters. Then it loads data, samples hyperparameters, calls the fit function
that is supposed to construct and fit a model and finally evaluates the returned score functions on
the loaded data.
This function works for point cloud datasets. Differentiation between leave-one-in and leave-one-out
setting is done via parameter `method`.
This particular loop only loops over 1:max_seed with given anomaly class.
"""
function point_cloud_experimental_loop2(sample_params_f, fit_f, edit_params_f,
max_seed, modelname, dataset, contamination, savepath, anomaly_class, method)
# sample the random hyperparameters
parameters = sample_params_f()

# for the given number of seeds run the experiment
for seed in 1:max_seed
# with these hyperparameters, train and evaluate the model on different train/val/tst splits
# load data for either "MNIST_in" or "MNIST_out" and set the setting
# prepared for other point cloud datasets such as ModelNet10

# load data
data = load_data(dataset, anomaly_class_ind=anomaly_class, seed=seed, method=method, contamination=contamination)

# undersample data for leave-one-in and leave-one-out methods
if method == "leave-one-in"
data = GroupAD.leave_one_in(data; seed=seed)
elseif method == "leave-one-out"
data = GroupAD.leave_one_out(data; seed=seed)
else
error("This model can only run on point cloud datasets!")
end

# define where data is going to be saved
_savepath = joinpath(savepath, "$(modelname)/$(dataset)/$(method)/class_index=$(anomaly_class)/seed=$(seed)")
mkpath(_savepath)

# edit parameters
edited_parameters = edit_params_f(data, parameters, anomaly_class, method)

@info "Trying to fit $modelname on $(dataset) in $method setting.\nModel parameters: $(edited_parameters)..."
@info "Train/validation/test splits: $(size(data[1][1], 2)) | $(size(data[2][1], 2)) | $(size(data[3][1], 2))"
@info "Number of features: $(size(data[1][1], 1))"

@info "Params check done. Trying to fit."
# fit
training_info, results = fit_f(data, edited_parameters)

# save the model separately
if training_info.model != nothing
modelf = joinpath(_savepath, savename("model", edited_parameters, "bson", digits=5))
tagsave(
modelf,
Dict("model"=>training_info.model,
"fit_t"=>training_info.fit_t,
"history"=>training_info.history,
"parameters"=>edited_parameters
),
safe = true)
(@info "Model saved to $modelf")

training_info = merge(training_info, (model = nothing, history=nothing))
end

# here define what additional info should be saved together with parameters, scores, labels and predict times
save_entries = merge(training_info, (modelname = modelname, seed = seed, dataset = dataset))

# now loop over all anomaly score funs
@time for result in results
if modelname in ["vae_instance", "statistician", "PoolModel"]
@info "Trying to save results..."
experiment_bag(result..., data, _savepath; save_entries...)
else
experiment(result..., data, _savepath; save_entries...)
end
end
end
end
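A hypothetical driver call for this loop (every value below is a placeholder, not taken from the repository):

    point_cloud_experimental_loop2(
        sample_params, fit, edit_params,   # assumed user-supplied functions
        5,                                 # max_seed
        "vae_instance", "MNIST_in",        # modelname, dataset
        0.0, "results",                    # contamination, savepath
        1, "leave-one-in",                 # anomaly_class, method
    )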


"""
toy_experimental_loop_toy(sample_params_f, fit_f, edit_params_f,
max_seed, modelname, dataset, contamination, savepath)
84 changes: 84 additions & 0 deletions src/modelnet.jl
@@ -0,0 +1,84 @@
using HDF5
using Random
using StatsBase
using Mill

# Convert a 3×n×B tensor of B point clouds (n points each) into a Mill.jl
# BagNode: instances are flattened column-wise and grouped into B equal bags.
function tensors_to_mill(data)
n = size(data, 2)
lengths = repeat([n], size(data, 3))
idxes = Mill.length2bags(lengths)
flatten = reshape(data, 3, n * size(data, 3))
return BagNode(ArrayNode(flatten), idxes)
end
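A quick sketch of what `tensors_to_mill` produces (shapes are illustrative):

    using Mill

    data = randn(Float32, 3, 1024, 4)   # 4 point clouds, 1024 points, 3 coordinates each
    bn = tensors_to_mill(data)          # BagNode with 4 bags of 1024 instances
    size(bn.data.data)                  # (3, 4096): instances flattened column-wise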

function load_modelnet(npoints=2048; method="chair", validation::Bool=true, ratio=0.2, seed::Int=666, kwargs...)
# `method` actually carries the name of the normal class here; reusing the
# existing keyword avoids changing too many call sites.
(X_train, Y_train), (X_val, Y_val), (X_test, Y_test) = _load_modelnet10(npoints, method; validation=validation, ratio=ratio, seed=seed)
X_train = tensors_to_mill(X_train)
X_val = tensors_to_mill(X_val)
X_test = tensors_to_mill(X_test)
return (X_train, Y_train), (X_val, Y_val), (X_test, Y_test)
end
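A hedged usage sketch (keyword values are illustrative; recall that `method` names the normal class):

    # "chair" bags are normal; every other class counts as anomalous in val/test
    (X_train, y_train), (X_val, y_val), (X_test, y_test) =
        load_modelnet(2048; method="chair", ratio=0.2, seed=42)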

function train_test_split(X, y, ratio=0.2; seed=nothing)
# simple util function
(seed!==nothing) ? Random.seed!(seed) : nothing

N = size(X,3)
idx_samples = sample(1:N, Int(floor(N*ratio)), replace=false)
idx_bool = zeros(Bool,N)
idx_bool[idx_samples] .= true

X_val = X[:,:,idx_bool]
Y_val = y[idx_bool]
X_train = X[:,:,.!idx_bool]
Y_train = y[.!idx_bool]

(seed!==nothing) ? Random.seed!() : nothing
return (X_train, Y_train), (X_val, Y_val)
end
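A minimal illustration of the split (shapes assumed):

    X = randn(3, 512, 100); y = rand(Bool, 100)
    (Xtr, ytr), (Xvl, yvl) = train_test_split(X, y, 0.2; seed=1)
    size(Xtr, 3), size(Xvl, 3)   # (80, 20)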


function _load_modelnet10(npoints=2048, type="all"; validation::Bool=true, ratio=0.2, seed::Int=666)
"""
npoints ... number of points per object (512 / 1024 / 2048)
type ... name of the normal class, e.g. \"chair\", \"monitor\" (the \"all\" option only applies to the commented-out variant below)
validation ... whether to also return a validation set (\"true\") or not (\"false\")
seed ... random seed for the validation split
"""

# load data (the do-block closes the HDF5 file handle after reading)
X_train, X_test, Y_train, Y_test = HDF5.h5open("/home/maskomic/projects/GroupAD.jl/data/modelnet10_$(npoints).h5") do data
read(data["X_train"]), read(data["X_test"]), read(data["Y_train"]), read(data["Y_test"])
end

titles = ["bathtub", "bed", "chair", "desk", "dresser", "monitor", "night_stand", "sofa", "table", "toilet"]

# if validation
# (X_train,Y_train), (X_val,Y_val) = train_test_split(X_train, Y_train, ratio, seed=seed)
# if type in titles
# idx = findmax(titles .== type)[2]
# X_train = X_train[:, :, Y_train .== idx]
# Y_train = zeros(Bool,size(Y_train[Y_train .== idx]))
# Y_val = Y_val .!= idx
# Y_test = Y_test .!= idx
# end
# data = ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
# else
# if type in titles
# idx = findmax(titles .== type)[2]
# X_train = X_train[:, :, Y_train .== idx]
# Y_train = zeros(Bool,size(Y_train[Y_train .== idx]))
# Y_test = Y_test .!= idx
# end
# data = ((X_train, Y_train), (X_test, Y_test))
# end
# validation split, then one-class filtering: keep only `type` bags for training
# and mark every bag whose class differs from `type` as anomalous in val/test
(X_train,Y_train), (X_val,Y_val) = train_test_split(X_train, Y_train, ratio, seed=seed)
idx = findmax(titles .== type)[2]
X_train = X_train[:, :, Y_train .== idx]
Y_train = zeros(Bool,size(Y_train[Y_train .== idx]))
Y_val = Y_val .!= idx
Y_test = Y_test .!= idx
data = ((X_train, Y_train), (X_val, Y_val), (X_test, Y_test))
return data
end
40 changes: 15 additions & 25 deletions src/models/PoolModel.jl
@@ -23,28 +23,30 @@ transformation of all instances, the latter only transforms the one-vector summary
The summary is created with a pooling function which has to be permutation invariant.
Possible functions include: mean, sum, maximum, etc.
"""
struct PoolModel{pre <: Chain, post <: Chain, fun <: Function, p <: ContinuousMultivariateDistribution, e <: ConditionalMvNormal, g <: ConditionalMvNormal, d <: Chain}
struct PoolModel{pre <: Chain, post <: Chain, fun <: Function, g <: ConditionalMvNormal, d <: Chain}
prepool_net::pre
postpool_net::post
poolf::fun
prior::p
encoder::e
generator::g
decoder::d
end

Flux.@functor PoolModel

function Flux.trainable(m::PoolModel)
(prepool_net = m.prepool_net, postpool_net = m.postpool_net, encoder = m.encoder, generator = m.generator, decoder = m.decoder)
(prepool_net = m.prepool_net, postpool_net = m.postpool_net, generator = m.generator, decoder = m.decoder)
end

function PoolModel(pre, post, fun, gen, dec, enc::ConditionalMvNormal, plength::Int)
W = first(Flux.params(enc))
μ = fill!(similar(W, plength), 0)
σ = fill!(similar(W, plength), 1)
prior = DistributionsAD.TuringMvNormal(μ, σ)
PoolModel(pre, post, fun, prior, enc, gen, dec)
end
# function PoolModel(pre, post, fun, gen, dec)
# PoolModel(pre, post, fun, gen, dec)
# end

function (m::PoolModel)(x)
v = m.prepool_net(x)
p = m.poolf(v)
p_post = m.postpool_net(p)
z = hcat([rand(m.generator, p_post) for i in 1:size(x, 2)]...)
m.decoder(z)
end
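For reference, a few pooling functions that satisfy the permutation-invariance requirement on `poolf` (a sketch; the names are illustrative, not from the repository):

    using Statistics

    # each one collapses a d×n instance matrix to a d-vector summary,
    # independent of the column (instance) order
    meanpool(x) = vec(mean(x; dims=2))
    sumpool(x)  = vec(sum(x; dims=2))
    maxpool(x)  = vec(maximum(x; dims=2))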

function Base.show(io::IO, pm::PoolModel)
@@ -88,25 +90,12 @@ function pm_constructor(;idim, hdim, predim, postdim, edim, activation="swish",
)

if var == "scalar"
# encoder
enc = Chain(
build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
SplitLayer(hdim,[edim,1])
)
enc_dist = ConditionalMvNormal(enc)

gen = Chain(
build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
SplitLayer(hdim,[edim,1])
)
gen_dist = ConditionalMvNormal(gen)
else
enc = Chain(
build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
SplitLayer(hdim,[edim,edim])
)
enc_dist = ConditionalMvNormal(enc)

gen = Chain(
build_mlp(postdim,hdim,hdim,nlayers-1,activation=activation)...,
SplitLayer(hdim,[edim,edim])
@@ -119,7 +108,7 @@
Dense(hdim,idim)
)

pm = PoolModel(pre, post, fun, gen_dist, dec, enc_dist, edim)
pm = PoolModel(pre, post, fun, gen_dist, dec)
return pm
end

@@ -229,7 +218,8 @@ function StatsBase.fit!(model::PoolModel, data::Tuple, loss::Function;
for batch in RandomBatches(tr_x, 10)
# classic training
bag_batch = RandomBagBatches(tr_x,batchsize=batchsize,randomize=true)
Flux.train!(lossf, ps, [bag_batch], opt)
# Flux.train!(lossf, ps, [bag_batch], opt)
Flux.train!(lossf, ps, bag_batch, opt)
# only batch training loss
batch_loss = lossf(bag_batch) # mean(lossf.(bag_batch))

34 changes: 25 additions & 9 deletions src/models/hmil.jl
@@ -159,17 +159,17 @@ function fit_hmil!(model::Chain, data::Tuple, loss::Function;
ps = Flux.params(tr_model)
_patience = patience

best_val_loss = Inf
i = 1
start_time = time()

# prepare data
tr, vl = classification_data(data, na, seed)
train_data, train_labels = tr[1], Flux.onehotbatch(tr[2], [0,1])
val_data, val_labels = vl[1], Flux.onehotbatch(vl[2], [0,1])

lossf(x, y) = loss(tr_model, x, y)

best_val_loss = Inf
i = 1
start_time = time()

# infinite for loop via RandomBatches
for batch in RandomBatches(train_data, 1)
# classic training
@@ -185,8 +185,13 @@
# validation/early stopping
# val_loss = lossf(bag_batch) # mean(lossf.(val_x))
val_loss = lossf(val_data, val_labels)
train_loss = lossf(train_data, train_labels)

acc = mean(Flux.onecold(tr_model(train_data), [0,1]) .== Flux.onecold(train_labels, [0,1]))

@info "$i - loss: $(batch_loss) (batch) | $(val_loss) (validation)"
@info "train loss: $train_loss"
@info "train accuracy: $acc"

if isnan(val_loss) || isnan(batch_loss)
error("Encountered invalid values in loss function.")
@@ -227,7 +232,7 @@ function hmil_basic_loop(sample_params_f, fit_f, edit_params_f,
# sample the random hyperparameters
parameters = sample_params_f()

for na in [100,10,20]
for na in [100,5,10,20]
# for na in 0
# with these hyperparameters, train and evaluate the model on different train/val/tst splits
for seed in 1:max_seed
@@ -282,11 +287,11 @@ function hmil_pc_loop(sample_params_f, fit_f, edit_params_f,
# sample the random hyperparameters
parameters = sample_params_f()

for na in [100,10,20]
Threads.@threads for na in [10, 20, 100]
# run over all `na` values in parallel with the same hyperparameters
# use more CPU cores for the calculation
@info "Starting parallel process on $(Threads.nthreads()) threads (over $max_seed seeds per na value)."
Threads.@threads for seed in 1:max_seed
for seed in 1:max_seed
# with these hyperparameters, train and evaluate the model on different train/val/tst splits
# load data for either "MNIST_in" or "MNIST_out" and set the setting
# prepared for other point cloud datasets such as ModelNet10
@@ -296,13 +301,24 @@
data = GroupAD.leave_one_in(data; seed=seed)
elseif method == "leave-one-out"
data = GroupAD.leave_one_out(data; seed=seed)
elseif dataset == "modelnet"
# data = (
# (data[1][1], .!data[1][2]),
# (data[2][1], .!data[2][2]),
# (data[3][1], .!data[3][2])
# )
nothing
else
error("This model can only run on point cloud datasets!")
end

# define where data is going to be saved
# _savepath = joinpath(savepath, "$(modelname)/$(dataset)/$(method)/class_index=$(class)/seed=$(seed)")
_savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
if dataset == "modelnet"
_savepath = joinpath(savepath, "$(modelname)/$(method)/seed=$(seed)")
else
_savepath = joinpath(savepath, "$(modelname)/$(method)/class_index=$(class)/seed=$(seed)")
end
mkpath(_savepath)

# edit parameters
@@ -349,4 +365,4 @@ end

function get_label_hmil(model, x)
Flux.onecold(model(x), [0,1])
end
end
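A sketch of how the helper is used (variable names are assumed):

    ŷ = get_label_hmil(model, val_data)   # vector of 0/1 predictions
    acc = mean(ŷ .== val_y)               # assuming an integer 0/1 ground-truth vector val_y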
