-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFormulation_DoE.R
288 lines (190 loc) · 14 KB
/
Formulation_DoE.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
### Design of Experiments for Liquid Formulations ###
## This code presents a model-weighted space filling DoE methodology to prepare shampoo formulations. ##
## The model-component is related to a phase stability classifier to drive the formulations towards regions of stability. ##
## The machine learning component is done in Python in a separate jupyter notebook "PhaseStability_ML.ipynb"
## The space-filling component utilises the MaxProQQ package written by Joseph, Gul & Ba, 2020. ##
# Clearing workspace and setting working directory ----
rm(list=ls()) # This clears the workspace, ensuring that any variables saved in the environment do not effect our current code.
setwd("/Users/ac2349/R/DoE") # Setting working directory, change as necessary.
# Installing/ loading libraries required to run this code ----
#install.packages('MaxPro') # one-time installation
#install.packages('caret')
#install.packages('purrr')
library(caret)
library(MaxPro)
library(purrr)
# Function definitions for programme ----
# The MaxPro DoE package generates variables on the interval [0,1]
# Need to be able to inter convert between these variables and concentrations
# convert design variables [0,1] to concentrations
conc_converter <- function(x,LB,UB) {
conc <- x*(UB - LB) + LB
return(conc)
}
# convert concentrations to [0,1]
invert_conc <- function(conc,LB,UB) {
x <- (conc - LB)/(UB - LB)
return(x)
}
# Pre-defined concentration bounds for formulations ingredients
S_LB <- 8.0
S_UB <- 13.0
P_LB <- 1.0
P_UB <- 3.0
T_LB <- 1.0
T_UB <- 5.0
# Sample density = 1/sum(weight_frac_ingredient/density_ingredient)
# Below function takes a dataframe with the ingredient mass % in a particular sample and returns its density
calc_density <- function(df,species_dict=ingredients_info) {
active_ingredients <- names(df)[df != 0]
active_conc <- df[, active_ingredients]
vol_frac <- 0.0 # initalise variable
for (ingredient in colnames(active_conc)) { # iterate over each active ingredient
ingredient_density <- as.numeric(species_dict[species_dict['name'] == ingredient][2])
vol_frac <- vol_frac + (active_conc[,ingredient]/100)/ingredient_density
} # equivalent to summing over all ingredients
return(1.0/vol_frac) # returns the sample density
}
ingredients_info <- read.csv('SpeciesDictionary.csv') # csv file contains densities for ingredients.
# Takes a candidate set of experiments generated via the MaxPro package and converts them to a format consistent with the csv file sent to the OT-2 robot.
Cand_to_Dataset <- function(Cand_matrix, surf_pairs, OT_example_csv, n_samples, polymer, thickener, calc_density=FALSE) {
# First input is a matrix with 5 variables - the candidate experiments.
# First four variables correspond to concentrations: C_S1, C_S2, C_P, C_T
# Fifth variable is a discrete factor representing the choice of surfactant pair.
# Create a data frame placeholder to populate with the candidate experiments
DoE_cols <- colnames(OT_example_csv) # the column names should match the csv file sent to the Opentrons robot.
Cand_DoE <- data.frame(matrix(nrow=n_samples, ncol=length(DoE_cols))) # create an empty dataframe
colnames(Cand_DoE) <- DoE_cols # rename the data frame columns to match with the target csv
# Populate the ID and Sample rows - currently the same, but created 2 separate rows in case there are repeat experiments performed.
Cand_DoE[,"ID"] <- 1:n_samples
Cand_DoE[,"Sample"] <- 1:n_samples
Cand_df <- data.frame(Cand_matrix) # convert candidate matrix to data frame
# Populate all remaining NA values as 0. Non-active ingredient concentrations will = 0.
Cand_DoE[is.na(Cand_DoE)] = 0
# The surfactant pairs have been encoded into 66 nominal factors - need to convert between these and the selected surfactants.
S_factor = Cand_matrix[,5] # read the factor for each sample in the candidate set - column 5, the last column corresponds this categorical factor. Columns 1-4 are concentrations.
# read off S1, S2
S1 = surf_pairs[S_factor,2]
S2 = surf_pairs[S_factor,3]
surf_names <- colnames(OT_example_csv[,3:14])
for (s in surf_names) {
S1_idx <- which(S1 %in% s)
S2_idx <- which(S2 %in% s)
Cand_DoE[S1_idx,3:14][s] <- conc_converter(Cand_df[S1_idx,1], S_LB, S_UB)
Cand_DoE[S2_idx,3:14][s] <- conc_converter(Cand_df[S2_idx,2], S_LB, S_UB)
}
Cand_DoE[polymer] <- conc_converter(Cand_df[,3], P_LB, P_UB)
Cand_DoE[thickener] <- conc_converter(Cand_df[,4], T_LB, T_UB)
Cand_DoE[,"Water"] <- 100 - rowSums(Cand_DoE[,3:20]) # Mass % must sum to 100. The remaining content in the formulations is water.
if (calc_density == FALSE) {
Cand_DoE[,"Sample Density"] <- 1.00 # assuming the sample density = 1 (this is a very good approximation & avoids doing the row-wise operation, over possibly 100000s candidate samples)
} else {
# Extract out the ingredient names from the candidate dataframe.
ingredient_list <- colnames(Cand_DoE[,3:21]) # indices 3:21 correspond to the 19 ingredients, including water.
sample_density <- c() # Generate an empty vector to store the density values of the sample.
# Compute the sample density using the pre-defined density calculation function above.
for (idx in 1:n_samples) { # iterate through the samples 1-by-1
x <- Cand_DoE[idx, ingredient_list]
sample_density <- append(sample_density, calc_density(x)) # append the density to the vector
}
Cand_DoE[,"Sample Density"] <- sample_density
}
Cand_DoE[,3:22] <- round(Cand_DoE[,3:22], digits=2)
return(Cand_DoE)
}
# Candidate DoE points ----
set.seed(42) # fix the random seed so to ensure reproducibility between code runs.
N <- 360000 # number of candidate points, equivalent to all combinations if the conc. space was discretised into 0.5 w/w% intervals.
C <- CandPoints(N, p_cont=4, l_nom=66) # 4 continuous variables (S1, S2, P, T concentrations) & 66 possible surfactant pairs from 12 surf ingredients.
Cand_df <- data.frame(C) # convert candidate matrix to data frame
# Write out the candidate design to a .csv file to be read by Pandas - need this to filter into the restricted candidate set based on phase stability predictions
write.csv(Cand_df, 'CandDesign.csv') # Note design here refers to the DoE design with variables in the interval [0,1]
OT_example_csv <- read.csv('OT-2_DoE_Example.csv', check.names = FALSE)
surf_pairs <- read.csv('SurfactantPairs.csv') # csv file to read which factor corresponds to which surfactant pair.
# ENSURE CORRECT POLYMER, THICKENER SELECTED!!
polymer = "Dehyquart CC7 Benz"
thickener = "Arlypon TT"
Cand_DoE <- Cand_to_Dataset(C, surf_pairs, OT_example_csv, N, polymer, thickener) # Converting the MaxPro candidate points to the format of the formulations dataset
#View(Cand_DoE)
# Read the formulations data and features ----
my_data <- read.csv("PhDFormulationsDataset_2023.csv", check.names=FALSE) # read the master dataset containing the samples according to their mass composition & their properties.
#my_data <- rbind(my_data[523:582, ], my_data[619:666, ]) # select only the columns with data for the current (P, T) combination.
my_data <- my_data[751:822, ]
surf_conc_expt <- my_data[2:13] # extract out the data for the 12 surfactants (Texapon SB 3 KC ... Dehyquart A-CA)
surf_conc_expt <- surf_conc_expt[complete.cases(surf_conc_expt), ] # drop any fully empty rows (sometimes erroneously saved into the dataset csv).
# Extract the surfactant concentrations from the candidate DoE
surf_conc_cand <- Cand_DoE[,3:14] # indices 3:14 correspond to Texapon SB 3 KC ... Dehyquart A-CA
# Extract the concentration for the conditioning polymer (P) and thickener (T) - ensure to change ingredient name according to the choice of P,T used in the experiment.
poly_thick_expt <- data.frame(cbind(my_data[polymer], my_data[thickener])) # ARE THE CHOSEN P, T CORRECT?
poly_thick_expt <- poly_thick_expt[complete.cases(poly_thick_expt), ]
colnames(poly_thick_expt) <- c("P", "T") # rename the columns in the P,T dataframe.
# extract out the P, T concentrations for the candidate set
poly_thick_cand <- data.frame(cbind(Cand_DoE[][polymer], Cand_DoE[][thickener]))
colnames(poly_thick_cand) <- c("P", "T") # rename the columns in the P,T dataframe.
surf_features <- read.csv("Surf_FGs.csv", row.names=1, check.names = FALSE) # read the data file containing the scaled FG count for each surfactant and its average chain length.
# check.names = FALSE so it doesn't change how the ingredient names are written - important for indexing.
# Pre-processing: calculate and scale the sample features ----
# Calculate the sample features via matrix multiplication of the surfactant conc. and features + append the P, T concentrations
expt_sample_features <- as.matrix(surf_conc_expt) %*% as.matrix(surf_features) # matrix dot product between the surfactant concentrations and their features results in a (#n_samples, #n_features) matrix.
expt_sample_features <- cbind(expt_sample_features, poly_thick_expt) # append the P, T concentrations onto the features
cand_sample_features <- as.matrix(surf_conc_cand) %*% as.matrix(surf_features) # matrix dot product between the surfactant concentrations and their features results in a (#n_samples, #n_features) matrix.
cand_sample_features <- cbind(cand_sample_features, poly_thick_cand) # append the P, T concentrations onto the features
# Feature scaling
preProc <- preProcess(expt_sample_features, method = c("range")) # use min-max scaling in the caret package to pre-process the data.
X_expt <- predict(preProc, expt_sample_features)
X_cand <- predict(preProc, cand_sample_features) # ensured the candidate set has been pre-processed in the same way.
# Extract the response variable (phase stability) from the experimental dataset
y <- as.factor(my_data$Stability_Test[complete.cases(my_data$Stability_Test)]) # read the output (stability) from the formulations dataset and it must be coded as a factor in R.
X_expt <- round(X_expt, digits=2)
featurePlot(X_expt, y, plot="strip") # Plot the features grouped by stability response for feature exploration/visualisation
df <- cbind(X_expt, y) # add the output to the input features, X, to generate the overall dataset, df.
df <- as.data.frame(df) # this will be used for machine learning.
# Machine Learning part in Python, write out csv files and read back in csv files ----
write.csv(df, "ExperimentalSet.csv") # experimental data with the surfactant features, polymer/thickener concentrations & phase stability responses
write.csv(X_cand, "CandidateSet.csv") # Note set refers to the candidate set of sample experiments in the concentrations space.
# A variety of machine learning models are trained and tuned in Python on the stability classification problem
# The most promising model at any given iteration (of the formulations workflow, i.e. week-on-week) is used to predict the phase stability of the candidate points
# Depending on the predicted likelihood of preparing stable formulations, the candidate design is filtered into a restricted set of possible experiments. This is what MaxProAugment chooses from.
Restricted_Cand_Design <- read.csv('RestrictedCandDesign.csv')
Restricted_Cand_Design <- Restricted_Cand_Design[,2:6]
# Convert the experimental data into the candidate set format ----
Sample_ID <- c(1:nrow(surf_conc_expt))
df_surf_expt <- cbind(Sample_ID, surf_conc_expt)
expt_surf_pairs <- df_surf_expt %>% split(.$Sample_ID) %>% map(~names(.x)[!!.x][-1])
# Create an empty dataframe with these column names.
Exist_Design <- data.frame(matrix(nrow=nrow(df_surf_expt), ncol=length(Cand_df)))
colnames(Exist_Design) = colnames(Cand_df)
for (idx in 1:nrow(surf_conc_expt)) {
S1 = unlist(expt_surf_pairs[idx])[1]
S2 = unlist(expt_surf_pairs[idx])[2]
Exist_Design[idx,1] <- invert_conc(surf_conc_expt[idx,][S1], S_LB, S_UB)
Exist_Design[idx,2] <- invert_conc(surf_conc_expt[idx,][S2], S_LB, S_UB)
Exist_Design[idx,3] <- invert_conc(poly_thick_expt[idx, ]["P"], P_LB, P_UB)
Exist_Design[idx,4] <- invert_conc(poly_thick_expt[idx, ]["T"], T_LB, T_UB)
Exist_Design[idx,5] <- which(surf_pairs[,2] == S1 & surf_pairs[,3] == S2)
}
# Write out Existing Design
write.csv(Exist_Design, "P3_T1_Design.csv")
# Generate new sample points ----
nNew <- 36
#New_Design <- MaxProAugment(as.matrix(Exist_Design), Restricted_Cand_Design, nNew, p_nom=1, l_nom=66)$Design # run this line for ML-guided DoE
New_Design <- MaxProAugment(as.matrix(Exist_Design), C, nNew, p_nom=1, l_nom=66)$Design # run this line for purely space filling DoE
#new_samples <- New_Design[(nrow(New_Design) - nNew + 1):nrow(New_Design), ]
#new_samples
Expt_DoE <- Cand_to_Dataset(New_Design, surf_pairs, OT_example_csv, nrow(New_Design), polymer, thickener, calc_density = TRUE) # Converting the MaxPro candidate points to the format of the formulations dataset
View(Expt_DoE)
path_out = '/Users/ac2349/R/DoE/SuggestedExperiments/' # change this path to the folder where you want to save the suggested experiments csv file.
fileName = paste(path_out, 'P3-T1_OT_DoE_0410-09-2023.csv', sep = '')
write.csv(Expt_DoE, fileName, row.names = FALSE)
# Initialise dataset for new polymer, thickener combination ----
set.seed(42) # fix the random seed so to ensure reproducibility between code runs.
N_P3_T1_init <- 24
P3_T1_init_design <- CandPoints(N_P3_T1_init, p_cont=4, l_nom=66) # 4 continuous variables (S1, S2, P, T concentrations) & 66 possible surfactant pairs from 12 surf ingredients.
# ENSURE CORRECT POLYMER, THICKENER SELECTED!!
polymer = "Dehyquart CC7 Benz"
thickener = "Arlypon TT"
P3_T1_init <- Cand_to_Dataset(P3_T1_init_design, surf_pairs, OT_example_csv, N_P3_T1_init, polymer, thickener, calc_density = TRUE)
View(P3_T1_init)
path_out = '/Users/ac2349/R/DoE/SuggestedExperiments/'
fileName = paste(path_out, 'MasterDataset_OT_DoE_P3_T1_init.csv', sep = '')
write.csv(P3_T1_init, fileName, row.names = FALSE)