From 89a0bd4ca73f3b4afce845482b5cc49df46cfe59 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Thu, 19 Dec 2024 19:05:40 +0100 Subject: [PATCH 1/9] replace old vignettes and examples --- R-package/DESCRIPTION | 3 +- R-package/R/xgb.importance.R | 111 +- R-package/R/xgb.plot.deepness.R | 23 +- R-package/R/xgb.plot.importance.R | 13 +- R-package/R/xgb.plot.multi.trees.R | 21 +- R-package/R/xgb.plot.shap.R | 51 +- R-package/R/xgb.plot.tree.R | 22 +- R-package/R/xgboost.R | 29 +- R-package/man/xgb.importance.Rd | 111 +- R-package/man/xgb.plot.deepness.Rd | 23 +- R-package/man/xgb.plot.importance.Rd | 13 +- R-package/man/xgb.plot.multi.trees.Rd | 21 +- R-package/man/xgb.plot.shap.Rd | 51 +- R-package/man/xgb.plot.tree.Rd | 22 +- R-package/man/xgboost.Rd | 26 +- R-package/vignettes/vignette.css | 225 ---- R-package/vignettes/xgboost.bib | 28 - R-package/vignettes/xgboost_introduction.qmd | 196 ++++ doc/R-package/discoverYourData.md | 475 -------- doc/R-package/index.rst | 3 +- doc/R-package/xgboostPresentation.md | 589 ---------- doc/R-package/xgboost_introduction.md | 1012 ++++++++++++++++++ 22 files changed, 1463 insertions(+), 1605 deletions(-) delete mode 100644 R-package/vignettes/vignette.css delete mode 100644 R-package/vignettes/xgboost.bib create mode 100644 R-package/vignettes/xgboost_introduction.qmd delete mode 100644 doc/R-package/discoverYourData.md delete mode 100644 doc/R-package/xgboostPresentation.md create mode 100644 doc/R-package/xgboost_introduction.md diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index d718bc2f72ce..524430ba7b2a 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -45,10 +45,11 @@ License: Apache License (== 2.0) | file LICENSE URL: https://github.com/dmlc/xgboost BugReports: https://github.com/dmlc/xgboost/issues NeedsCompilation: yes -VignetteBuilder: knitr +VignetteBuilder: knitr, quarto Suggests: knitr, rmarkdown, + quarto, ggplot2 (>= 1.0.1), DiagrammeR (>= 0.9.0), DiagrammeRsvg, diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index e272405d7e54..3f8cd2edf720 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -38,85 +38,64 @@ #' (based on C++ code), it starts at 0 (as in C/C++ or Python) instead of 1 (usual in R). 
#' #' @examples -#' -#' # binomial classification using "gbtree": -#' data(agaricus.train, package = "xgboost") -#' -#' bst <- xgb.train( -#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' nrounds = 2, -#' params = xgb.params( -#' max_depth = 2, -#' nthread = 2, -#' objective = "binary:logistic" -#' ) +#' # binary classification using "gbtree": +#' data("ToothGrowth") +#' x <- ToothGrowth[, c("len", "dose")] +#' y <- ToothGrowth$supp +#' model_tree_binary <- xgboost( +#' x, y, +#' nrounds = 5L, +#' nthreads = 1L, +#' booster = "gbtree", +#' max_depth = 2L #' ) -#' -#' xgb.importance(model = bst) -#' -#' # binomial classification using "gblinear": -#' bst <- xgb.train( -#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), -#' nrounds = 20, -#' params = xgb.params( -#' booster = "gblinear", -#' learning_rate = 0.3, -#' nthread = 1, -#' objective = "binary:logistic" -#' ) +#' xgb.importance(model_tree_binary) +#' +#' # binary classification using "gblinear": +#' model_tree_linear <- xgboost( +#' x, y, +#' nrounds = 5L, +#' nthreads = 1L, +#' booster = "gblinear", +#' learning_rate = 0.3 #' ) -#' -#' xgb.importance(model = bst) -#' -#' # multiclass classification using "gbtree": -#' nclass <- 3 -#' nrounds <- 10 -#' mbst <- xgb.train( -#' data = xgb.DMatrix( -#' as.matrix(iris[, -5]), -#' label = as.numeric(iris$Species) - 1 -#' ), -#' nrounds = nrounds, -#' params = xgb.params( -#' max_depth = 3, -#' nthread = 2, -#' objective = "multi:softprob", -#' num_class = nclass -#' ) +#' xgb.importance(model_tree_linear) +#' +#' # multi-class classification using "gbtree": +#' data("iris") +#' x <- iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")] +#' y <- iris$Species +#' model_tree_multi <- xgboost( +#' x, y, +#' nrounds = 5L, +#' nthreads = 1L, +#' booster = "gbtree", +#' max_depth = 3 #' ) -#' #' # all classes clumped together: -#' xgb.importance(model = mbst) -#' +#' xgb.importance(model_tree_multi) #' # inspect importances separately for each class: +#' num_classes <- 3L +#' nrounds <- 5L #' xgb.importance( -#' model = mbst, trees = seq(from = 1, by = nclass, length.out = nrounds) +#' model_tree_multi, trees = seq(from = 1, by = num_classes, length.out = nrounds) #' ) #' xgb.importance( -#' model = mbst, trees = seq(from = 2, by = nclass, length.out = nrounds) +#' model_tree_multi, trees = seq(from = 2, by = num_classes, length.out = nrounds) #' ) #' xgb.importance( -#' model = mbst, trees = seq(from = 3, by = nclass, length.out = nrounds) +#' model_tree_multi, trees = seq(from = 3, by = num_classes, length.out = nrounds) #' ) #' -#' # multiclass classification using "gblinear": -#' mbst <- xgb.train( -#' data = xgb.DMatrix( -#' scale(as.matrix(iris[, -5])), -#' label = as.numeric(iris$Species) - 1 -#' ), -#' nrounds = 15, -#' params = xgb.params( -#' booster = "gblinear", -#' learning_rate = 0.2, -#' nthread = 1, -#' objective = "multi:softprob", -#' num_class = nclass -#' ) +#' # multi-class classification using "gblinear": +#' model_linear_multi <- xgboost( +#' x, y, +#' nrounds = 5L, +#' nthreads = 1L, +#' booster = "gblinear", +#' learning_rate = 0.2 #' ) -#' -#' xgb.importance(model = mbst) -#' +#' xgb.importance(model_linear_multi) #' @export xgb.importance <- function(model = NULL, feature_names = getinfo(model, "feature_name"), trees = NULL) { diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index d6ba9c3d2411..c1c5c96b96b4 100644 --- a/R-package/R/xgb.plot.deepness.R +++ 
b/R-package/R/xgb.plot.deepness.R @@ -49,27 +49,24 @@ #' data.table::setDTthreads(nthread) #' #' ## Change max_depth to a higher number to get a more significant result -#' bst <- xgb.train( -#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), +#' model <- xgboost( +#' agaricus.train$data, factor(agaricus.train$label), #' nrounds = 50, -#' params = xgb.params( -#' max_depth = 6, -#' nthread = nthread, -#' objective = "binary:logistic", -#' subsample = 0.5, -#' min_child_weight = 2 -#' ) +#' max_depth = 6, +#' nthreads = nthread, +#' subsample = 0.5, +#' min_child_weight = 2 #' ) #' -#' xgb.plot.deepness(bst) -#' xgb.ggplot.deepness(bst) +#' xgb.plot.deepness(model) +#' xgb.ggplot.deepness(model) #' #' xgb.plot.deepness( -#' bst, which = "max.depth", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2 +#' model, which = "max.depth", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2 #' ) #' #' xgb.plot.deepness( -#' bst, which = "med.weight", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2 +#' model, which = "med.weight", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2 #' ) #' #' @rdname xgb.plot.deepness diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index 8acec15a0dd5..e6c1323be4b2 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -50,17 +50,14 @@ #' nthread <- 2 #' data.table::setDTthreads(nthread) #' -#' bst <- xgb.train( -#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), +#' model <- xgboost( +#' agaricus.train$data, factor(agaricus.train$label), #' nrounds = 2, -#' params = xgb.params( -#' max_depth = 3, -#' nthread = nthread, -#' objective = "binary:logistic" -#' ) +#' max_depth = 3, +#' nthreads = nthread #' ) #' -#' importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst) +#' importance_matrix <- xgb.importance(model) #' xgb.plot.importance( #' importance_matrix, rel_to_first = TRUE, xlab = "Relative importance" #' ) diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index 39966271cbb2..4e3caeaff509 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -36,26 +36,23 @@ #' nthread <- 2 #' data.table::setDTthreads(nthread) #' -#' bst <- xgb.train( -#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), +#' model <- xgboost( +#' agaricus.train$data, factor(agaricus.train$label), #' nrounds = 30, -#' verbose = 0, -#' params = xgb.params( -#' max_depth = 15, -#' learning_rate = 1, -#' nthread = nthread, -#' objective = "binary:logistic", -#' min_child_weight = 50 -#' ) +#' verbosity = 0L, +#' nthreads = nthread, +#' max_depth = 15, +#' learning_rate = 1, +#' min_child_weight = 50 #' ) #' -#' p <- xgb.plot.multi.trees(model = bst, features_keep = 3) +#' p <- xgb.plot.multi.trees(model, features_keep = 3) #' print(p) #' #' # Below is an example of how to save this plot to a file. 
#' if (require("DiagrammeR") && require("DiagrammeRsvg") && require("rsvg")) { #' fname <- file.path(tempdir(), "tree.pdf") -#' gr <- xgb.plot.multi.trees(bst, features_keep = 3, render = FALSE) +#' gr <- xgb.plot.multi.trees(model, features_keep = 3, render = FALSE) #' export_graph(gr, fname, width = 1500, height = 600) #' } #' @export diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 43b8770271da..116387f1d790 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -81,51 +81,44 @@ #' data.table::setDTthreads(nthread) #' nrounds <- 20 #' -#' bst <- xgb.train( -#' data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), +#' model_binary <- xgboost( +#' agaricus.train$data, factor(agaricus.train$label), #' nrounds = nrounds, -#' verbose = 0, -#' params = xgb.params( -#' learning_rate = 0.1, -#' max_depth = 3, -#' subsample = 0.5, -#' objective = "binary:logistic", -#' nthread = nthread -#' ) +#' verbosity = 0L, +#' learning_rate = 0.1, +#' max_depth = 3L, +#' subsample = 0.5, +#' nthreads = nthread #' ) #' -#' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") +#' xgb.plot.shap(agaricus.test$data, model = model_binary, features = "odor=none") #' -#' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) -#' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3) +#' contr <- predict(model_binary, agaricus.test$data, type = "contrib") +#' xgb.plot.shap(agaricus.test$data, contr, model = model_binary, top_n = 12, n_col = 3) #' #' # Summary plot -#' xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) +#' xgb.ggplot.shap.summary(agaricus.test$data, contr, model = model_binary, top_n = 12) #' #' # Multiclass example - plots for each class separately: -#' nclass <- 3 #' x <- as.matrix(iris[, -5]) #' set.seed(123) #' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values #' -#' mbst <- xgb.train( -#' data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1), +#' model_multiclass <- xgboost( +#' x, iris$Species, #' nrounds = nrounds, -#' verbose = 0, -#' params = xgb.params( -#' max_depth = 2, -#' subsample = 0.5, -#' nthread = nthread, -#' objective = "multi:softprob", -#' num_class = nclass -#' ) +#' verbosity = 0, +#' max_depth = 2, +#' subsample = 0.5, +#' nthreads = nthread #' ) +#' nclass <- 3 #' trees0 <- seq(from = 1, by = nclass, length.out = nrounds) #' col <- rgb(0, 0, 1, 0.5) #' #' xgb.plot.shap( #' x, -#' model = mbst, +#' model = model_multiclass, #' trees = trees0, #' target_class = 0, #' top_n = 4, @@ -137,7 +130,7 @@ #' #' xgb.plot.shap( #' x, -#' model = mbst, +#' model = model_multiclass, #' trees = trees0 + 1, #' target_class = 1, #' top_n = 4, @@ -149,7 +142,7 @@ #' #' xgb.plot.shap( #' x, -#' model = mbst, +#' model = model_multiclass, #' trees = trees0 + 2, #' target_class = 2, #' top_n = 4, @@ -160,7 +153,7 @@ #' ) #' #' # Summary plot -#' xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) +#' xgb.ggplot.shap.summary(x, model = model_multiclass, target_class = 0, top_n = 4) #' #' @rdname xgb.plot.shap #' @export diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 0d93f871ea6b..54f9d0362cdc 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -37,25 +37,23 @@ #' line. 
#' #' @examples -#' data(agaricus.train, package = "xgboost") -#' -#' bst <- xgb.train( -#' data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), -#' nrounds = 2, -#' params = xgb.params( -#' max_depth = 3, -#' nthread = 2, -#' objective = "binary:logistic" -#' ) +#' data("ToothGrowth") +#' x <- ToothGrowth[, c("len", "dose")] +#' y <- ToothGrowth$supp +#' model <- xgboost( +#' x, y, +#' nthreads = 1L, +#' nrounds = 3L, +#' max_depth = 3L #' ) #' #' # plot the first tree -#' xgb.plot.tree(model = bst, tree_idx = 1) +#' xgb.plot.tree(model, tree_idx = 1) #' #' # Below is an example of how to save this plot to a file. #' if (require("DiagrammeR") && require("htmlwidgets")) { #' fname <- file.path(tempdir(), "plot.html'") -#' gr <- xgb.plot.tree(bst, tree_idx = 1) +#' gr <- xgb.plot.tree(model, tree_idx = 1) #' htmlwidgets::saveWidget(gr, fname) #' } #' @export diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index b62c25266269..49570032f511 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -856,12 +856,13 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { #' See the tutorial [Introduction to Boosted Trees](https://xgboost.readthedocs.io/en/stable/tutorials/model.html) #' for a longer explanation of what XGBoost does. #' -#' This function is intended to provide a more user-friendly interface for XGBoost that follows +#' This function is intended to provide a user-friendly interface for XGBoost that follows #' R's conventions for model fitting and predictions, but which doesn't expose all of the #' possible functionalities of the core XGBoost library. #' #' See [xgb.train()] for a more flexible low-level alternative which is similar across different -#' language bindings of XGBoost and which exposes the full library's functionalities. +#' language bindings of XGBoost and which exposes additional functionalities such as training on +#' external memory data and learning-to-rank objectives. #' #' @details #' For package authors using 'xgboost' as a dependency, it is highly recommended to use @@ -1045,7 +1046,29 @@ check.early.stopping.rounds <- function(early_stopping_rounds, eval_set) { #' # Task objective is determined automatically according to the type of 'y' #' data(iris) #' model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5) -#' predict(model_classif, iris, validate_features = TRUE) +#' predict(model_classif, iris[1:10,]) +#' predict(model_classif, iris[1:10,], type = "class") +#' +#' # Can nevertheless choose a non-default objective if needed +#' model_poisson <- xgboost( +#' mtcars[, -1], mtcars$mpg, +#' objective = "count:poisson", +#' nthreads = 1, +#' nrounds = 3 +#' ) +#' +#' # Can calculate evaluation metrics during boosting rounds +#' data(ToothGrowth) +#' xgboost( +#' ToothGrowth[, c("len", "dose")], +#' ToothGrowth$supp, +#' eval_metric = c("auc", "logloss"), +#' eval_set = 0.2, +#' monitor_training = TRUE, +#' verbosity = 1, +#' nthreads = 1, +#' nrounds = 3 +#' ) xgboost <- function( x, y, diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 8c2261bd4baa..6d346c0bdd5c 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -58,83 +58,62 @@ To obtain a meaningful ranking by importance for linear models, the features nee be on the same scale (which is also recommended when using L1 or L2 regularization). 
} \examples{ - -# binomial classification using "gbtree": -data(agaricus.train, package = "xgboost") - -bst <- xgb.train( - data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - nrounds = 2, - params = xgb.params( - max_depth = 2, - nthread = 2, - objective = "binary:logistic" - ) +# binary classification using "gbtree": +data("ToothGrowth") +x <- ToothGrowth[, c("len", "dose")] +y <- ToothGrowth$supp +model_tree_binary <- xgboost( + x, y, + nrounds = 5L, + nthreads = 1L, + booster = "gbtree", + max_depth = 2L ) - -xgb.importance(model = bst) - -# binomial classification using "gblinear": -bst <- xgb.train( - data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), - nrounds = 20, - params = xgb.params( - booster = "gblinear", - learning_rate = 0.3, - nthread = 1, - objective = "binary:logistic" - ) +xgb.importance(model_tree_binary) + +# binary classification using "gblinear": +model_tree_linear <- xgboost( + x, y, + nrounds = 5L, + nthreads = 1L, + booster = "gblinear", + learning_rate = 0.3 ) - -xgb.importance(model = bst) - -# multiclass classification using "gbtree": -nclass <- 3 -nrounds <- 10 -mbst <- xgb.train( - data = xgb.DMatrix( - as.matrix(iris[, -5]), - label = as.numeric(iris$Species) - 1 - ), - nrounds = nrounds, - params = xgb.params( - max_depth = 3, - nthread = 2, - objective = "multi:softprob", - num_class = nclass - ) +xgb.importance(model_tree_linear) + +# multi-class classification using "gbtree": +data("iris") +x <- iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")] +y <- iris$Species +model_tree_multi <- xgboost( + x, y, + nrounds = 5L, + nthreads = 1L, + booster = "gbtree", + max_depth = 3 ) - # all classes clumped together: -xgb.importance(model = mbst) - +xgb.importance(model_tree_multi) # inspect importances separately for each class: +num_classes <- 3L +nrounds <- 5L xgb.importance( - model = mbst, trees = seq(from = 1, by = nclass, length.out = nrounds) + model_tree_multi, trees = seq(from = 1, by = num_classes, length.out = nrounds) ) xgb.importance( - model = mbst, trees = seq(from = 2, by = nclass, length.out = nrounds) + model_tree_multi, trees = seq(from = 2, by = num_classes, length.out = nrounds) ) xgb.importance( - model = mbst, trees = seq(from = 3, by = nclass, length.out = nrounds) + model_tree_multi, trees = seq(from = 3, by = num_classes, length.out = nrounds) ) -# multiclass classification using "gblinear": -mbst <- xgb.train( - data = xgb.DMatrix( - scale(as.matrix(iris[, -5])), - label = as.numeric(iris$Species) - 1 - ), - nrounds = 15, - params = xgb.params( - booster = "gblinear", - learning_rate = 0.2, - nthread = 1, - objective = "multi:softprob", - num_class = nclass - ) +# multi-class classification using "gblinear": +model_linear_multi <- xgboost( + x, y, + nrounds = 5L, + nthreads = 1L, + booster = "gblinear", + learning_rate = 0.2 ) - -xgb.importance(model = mbst) - +xgb.importance(model_linear_multi) } diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd index e8729b7ca9be..1e1827e42384 100644 --- a/R-package/man/xgb.plot.deepness.Rd +++ b/R-package/man/xgb.plot.deepness.Rd @@ -74,27 +74,24 @@ nthread <- 2 data.table::setDTthreads(nthread) ## Change max_depth to a higher number to get a more significant result -bst <- xgb.train( - data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), +model <- xgboost( + agaricus.train$data, factor(agaricus.train$label), nrounds = 50, - params = xgb.params( - max_depth = 6, - nthread = nthread, - 
objective = "binary:logistic", - subsample = 0.5, - min_child_weight = 2 - ) + max_depth = 6, + nthreads = nthread, + subsample = 0.5, + min_child_weight = 2 ) -xgb.plot.deepness(bst) -xgb.ggplot.deepness(bst) +xgb.plot.deepness(model) +xgb.ggplot.deepness(model) xgb.plot.deepness( - bst, which = "max.depth", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2 + model, which = "max.depth", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2 ) xgb.plot.deepness( - bst, which = "med.weight", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2 + model, which = "med.weight", pch = 16, col = rgb(0, 0, 1, 0.3), cex = 2 ) } diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index 78ed27a22238..54bf073b6e36 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -88,17 +88,14 @@ data(agaricus.train) nthread <- 2 data.table::setDTthreads(nthread) -bst <- xgb.train( - data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), +model <- xgboost( + agaricus.train$data, factor(agaricus.train$label), nrounds = 2, - params = xgb.params( - max_depth = 3, - nthread = nthread, - objective = "binary:logistic" - ) + max_depth = 3, + nthreads = nthread ) -importance_matrix <- xgb.importance(colnames(agaricus.train$data), model = bst) +importance_matrix <- xgb.importance(model) xgb.plot.importance( importance_matrix, rel_to_first = TRUE, xlab = "Relative importance" ) diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd index a4421d239fdd..5ca8cd267e3f 100644 --- a/R-package/man/xgb.plot.multi.trees.Rd +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -71,26 +71,23 @@ data(agaricus.train, package = "xgboost") nthread <- 2 data.table::setDTthreads(nthread) -bst <- xgb.train( - data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), +model <- xgboost( + agaricus.train$data, factor(agaricus.train$label), nrounds = 30, - verbose = 0, - params = xgb.params( - max_depth = 15, - learning_rate = 1, - nthread = nthread, - objective = "binary:logistic", - min_child_weight = 50 - ) + verbosity = 0L, + nthreads = nthread, + max_depth = 15, + learning_rate = 1, + min_child_weight = 50 ) -p <- xgb.plot.multi.trees(model = bst, features_keep = 3) +p <- xgb.plot.multi.trees(model, features_keep = 3) print(p) # Below is an example of how to save this plot to a file. 
if (require("DiagrammeR") && require("DiagrammeRsvg") && require("rsvg")) { fname <- file.path(tempdir(), "tree.pdf") - gr <- xgb.plot.multi.trees(bst, features_keep = 3, render = FALSE) + gr <- xgb.plot.multi.trees(model, features_keep = 3, render = FALSE) export_graph(gr, fname, width = 1500, height = 600) } } diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd index a42f6640f315..2d9563117528 100644 --- a/R-package/man/xgb.plot.shap.Rd +++ b/R-package/man/xgb.plot.shap.Rd @@ -134,51 +134,44 @@ nthread <- 1 data.table::setDTthreads(nthread) nrounds <- 20 -bst <- xgb.train( - data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), +model_binary <- xgboost( + agaricus.train$data, factor(agaricus.train$label), nrounds = nrounds, - verbose = 0, - params = xgb.params( - learning_rate = 0.1, - max_depth = 3, - subsample = 0.5, - objective = "binary:logistic", - nthread = nthread - ) + verbosity = 0L, + learning_rate = 0.1, + max_depth = 3L, + subsample = 0.5, + nthreads = nthread ) -xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") +xgb.plot.shap(agaricus.test$data, model = model_binary, features = "odor=none") -contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) -xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3) +contr <- predict(model_binary, agaricus.test$data, type = "contrib") +xgb.plot.shap(agaricus.test$data, contr, model = model_binary, top_n = 12, n_col = 3) # Summary plot -xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) +xgb.ggplot.shap.summary(agaricus.test$data, contr, model = model_binary, top_n = 12) # Multiclass example - plots for each class separately: -nclass <- 3 x <- as.matrix(iris[, -5]) set.seed(123) is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values -mbst <- xgb.train( - data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1), +model_multiclass <- xgboost( + x, iris$Species, nrounds = nrounds, - verbose = 0, - params = xgb.params( - max_depth = 2, - subsample = 0.5, - nthread = nthread, - objective = "multi:softprob", - num_class = nclass - ) + verbosity = 0, + max_depth = 2, + subsample = 0.5, + nthreads = nthread ) +nclass <- 3 trees0 <- seq(from = 1, by = nclass, length.out = nrounds) col <- rgb(0, 0, 1, 0.5) xgb.plot.shap( x, - model = mbst, + model = model_multiclass, trees = trees0, target_class = 0, top_n = 4, @@ -190,7 +183,7 @@ xgb.plot.shap( xgb.plot.shap( x, - model = mbst, + model = model_multiclass, trees = trees0 + 1, target_class = 1, top_n = 4, @@ -202,7 +195,7 @@ xgb.plot.shap( xgb.plot.shap( x, - model = mbst, + model = model_multiclass, trees = trees0 + 2, target_class = 2, top_n = 4, @@ -213,7 +206,7 @@ xgb.plot.shap( ) # Summary plot -xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) +xgb.ggplot.shap.summary(x, model = model_multiclass, target_class = 0, top_n = 4) } \references{ diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 00ed6d24864e..077284583958 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -64,25 +64,23 @@ This function uses \href{https://www.graphviz.org/}{GraphViz} as DiagrammeR backend. 
} \examples{ -data(agaricus.train, package = "xgboost") - -bst <- xgb.train( - data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), - nrounds = 2, - params = xgb.params( - max_depth = 3, - nthread = 2, - objective = "binary:logistic" - ) +data("ToothGrowth") +x <- ToothGrowth[, c("len", "dose")] +y <- ToothGrowth$supp +model <- xgboost( + x, y, + nthreads = 1L, + nrounds = 3L, + max_depth = 3L ) # plot the first tree -xgb.plot.tree(model = bst, tree_idx = 1) +xgb.plot.tree(model, tree_idx = 1) # Below is an example of how to save this plot to a file. if (require("DiagrammeR") && require("htmlwidgets")) { fname <- file.path(tempdir(), "plot.html'") - gr <- xgb.plot.tree(bst, tree_idx = 1) + gr <- xgb.plot.tree(model, tree_idx = 1) htmlwidgets::saveWidget(gr, fname) } } diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 058090e1ad1f..570be5ee4638 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -222,7 +222,7 @@ Fits an XGBoost model (boosted decision tree ensemble) to given x/y data. See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{Introduction to Boosted Trees} for a longer explanation of what XGBoost does. -This function is intended to provide a more user-friendly interface for XGBoost that follows +This function is intended to provide a user-friendly interface for XGBoost that follows R's conventions for model fitting and predictions, but which doesn't expose all of the possible functionalities of the core XGBoost library. @@ -244,7 +244,29 @@ predict(model_regression, mtcars, validate_features = TRUE) # Task objective is determined automatically according to the type of 'y' data(iris) model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5) -predict(model_classif, iris, validate_features = TRUE) +predict(model_classif, iris[1:10,]) +predict(model_classif, iris[1:10,], type = "class") + +# Can nevertheless choose a non-default objective if needed +model_poisson <- xgboost( + mtcars[, -1], mtcars$mpg, + objective = "count:poisson", + nthreads = 1, + nrounds = 3 +) + +# Can calculate evaluation metrics during boosting rounds +data(ToothGrowth) +xgboost( + ToothGrowth[, c("len", "dose")], + ToothGrowth$supp, + eval_metric = c("auc", "logloss"), + eval_set = 0.2, + monitor_training = TRUE, + verbosity = 1, + nthreads = 1, + nrounds = 3 +) } \references{ \itemize{ diff --git a/R-package/vignettes/vignette.css b/R-package/vignettes/vignette.css deleted file mode 100644 index 59dfcd85c9df..000000000000 --- a/R-package/vignettes/vignette.css +++ /dev/null @@ -1,225 +0,0 @@ -body { - margin: 0 auto; - background-color: white; - -/* --------- FONT FAMILY -------- - following are some optional font families. 
Usually a family - is safer to choose than a specific font, - which may not be on the users computer */ -/ font-family:Georgia, Palatino, serif; - font-family: "Open Sans", "Book Antiqua", Palatino, serif; -/ font-family:Arial, Helvetica, sans-serif; -/ font-family:Tahoma, Verdana, Geneva, sans-serif; -/ font-family:Courier, monospace; -/ font-family:"Times New Roman", Times, serif; - -/* -------------- COLOR OPTIONS ------------ - following are additional color options for base font - you could uncomment another one to easily change the base color - or add one to a specific element style below */ - color: #333333; /* dark gray not black */ -/ color: #000000; /* black */ -/ color: #666666; /* medium gray black */ -/ color: #E3E3E3; /* very light gray */ -/ color: white; - - line-height: 100%; - max-width: 800px; - padding: 10px; - font-size: 17px; - text-align: justify; - text-justify: inter-word; -} - - -p { - line-height: 150%; -/ max-width: 540px; - max-width: 960px; - margin-bottom: 5px; - font-weight: 400; -/ color: #333333 -} - - -h1, h2, h3, h4, h5, h6 { - font-weight: 400; - margin-top: 35px; - margin-bottom: 15px; - padding-top: 10px; -} - -h1 { - margin-top: 70px; - color: #606AAA; - font-size:230%; - font-variant:small-caps; - padding-bottom:20px; - width:100%; - border-bottom:1px solid #606AAA; -} - -h2 { - font-size:160%; -} - -h3 { - font-size:130%; -} - -h4 { - font-size:120%; - font-variant:small-caps; -} - -h5 { - font-size:120%; -} - -h6 { - font-size:120%; - font-variant:small-caps; -} - -a { - color: #606AAA; - margin: 0; - padding: 0; - vertical-align: baseline; -} - -a:hover { - text-decoration: blink; - color: green; -} - -a:visited { - color: gray; -} - -ul, ol { - padding: 0; - margin: 0px 0px 0px 50px; -} -ul { - list-style-type: square; - list-style-position: inside; - -} - -li { - line-height:150% -} - -li ul, li ul { - margin-left: 24px; -} - -pre { - padding: 0px 10px; - max-width: 800px; - white-space: pre-wrap; -} - -code { - font-family: Consolas, Monaco, Andale Mono, monospace, courrier new; - line-height: 1.5; - font-size: 15px; - background: #F8F8F8; - border-radius: 4px; - padding: 5px; - display: inline-block; - max-width: 800px; - white-space: pre-wrap; -} - - -li code, p code { - background: #CDCDCD; - color: #606AAA; - padding: 0px 5px 0px 5px; -} - -code.r, code.cpp { - display: block; - word-wrap: break-word; - border: 1px solid #606AAA; -} - -aside { - display: block; - float: right; - width: 390px; -} - -blockquote { - border-left:.5em solid #606AAA; - background: #F8F8F8; - padding: 0em 1em 0em 1em; - margin-left:10px; - max-width: 500px; -} - -blockquote cite { - line-height:10px; - color:#bfbfbf; -} - -blockquote cite:before { - /content: '\2014 \00A0'; -} - -blockquote p, blockquote li { - color: #666; -} -hr { -/ width: 540px; - text-align: left; - margin: 0 auto 0 0; - color: #999; -} - - -/* table */ - -table { - width: 100%; - border-top: 1px solid #919699; - border-left: 1px solid #919699; - border-spacing: 0; -} - -table th { - padding: 4px 8px 4px 8px; - text-align: center; - color: white; - background: #606AAA; - border-bottom: 1px solid #919699; - border-right: 1px solid #919699; -} -table th p { - font-weight: bold; - margin-bottom: 0px; -} - -table td { - padding: 8px; - vertical-align: top; - border-bottom: 1px solid #919699; - border-right: 1px solid #919699; -} - -table td:last-child { - /background: lightgray; - text-align: right; -} - -table td p { - margin-bottom: 0px; -} -table td p + p { - margin-top: 5px; -} -table td p + 
p + p { - margin-top: 5px; -} diff --git a/R-package/vignettes/xgboost.bib b/R-package/vignettes/xgboost.bib deleted file mode 100644 index 908be3136459..000000000000 --- a/R-package/vignettes/xgboost.bib +++ /dev/null @@ -1,28 +0,0 @@ -@article{friedman2001greedy, - title={Greedy function approximation: a gradient boosting machine}, - author={Friedman, Jerome H}, - journal={Annals of Statistics}, - pages={1189--1232}, - year={2001}, - publisher={JSTOR} -} - -@article{friedman2000additive, - title={Additive logistic regression: a statistical view of boosting (with discussion and a rejoinder by the authors)}, - author={Friedman, Jerome and Hastie, Trevor and Tibshirani, Robert and others}, - journal={The annals of statistics}, - volume={28}, - number={2}, - pages={337--407}, - year={2000}, - publisher={Institute of Mathematical Statistics} -} - -@misc{ - Bache+Lichman:2013 , - author = "K. Bache and M. Lichman", - year = "2013", - title = "{UCI} Machine Learning Repository", - url = "https://archive.ics.uci.edu/", - institution = "University of California, Irvine, School of Information and Computer Sciences" -} diff --git a/R-package/vignettes/xgboost_introduction.qmd b/R-package/vignettes/xgboost_introduction.qmd new file mode 100644 index 000000000000..a9622bab50b5 --- /dev/null +++ b/R-package/vignettes/xgboost_introduction.qmd @@ -0,0 +1,196 @@ +--- +title: "XGBoost for R introduction" +vignette: > + %\VignetteEncoding{UTF-8} + %\VignetteIndexEntry{XGBoost for R introduction} + %\VignetteEngine{quarto::html} +format: + html: + embed-resources: true + theme: yeti +highlight-style: pygments +jupyter: ir +--- + +# Introduction + +**XGBoost** is an optimized distributed gradient boosting library designed to be highly **efficient**, **flexible** and **portable**. It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework. XGBoost provides a parallel tree boosting (also known as GBDT, GBM) that solve many data science problems in a fast and accurate way. The same code runs on major distributed environment (Hadoop, SGE, MPI) and can solve problems beyond billions of examples. + +For an introduction to the concept of gradient boosting, see the tutorial [Introduction to Boosted Trees](https://xgboost.readthedocs.io/en/stable/tutorials/model.html) in XGBoost's online docs. + +For more details about XGBoost's features and usage, see the [online documentation](https://xgboost.readthedocs.io/en/stable/) which contains more tutorials, examples, and details. + +This short vignette outlines the basic usage of the R interface for XGBoost, assuming the reader has some familiarity with the underlying concepts behind statistical modeling with gradient-boosted decision trees. + +# Building a predictive model + +At its core, XGBoost consists of a C++ library which offers bindings for different programming languages, including R. The R package for XGBoost provides an idiomatic interface similar to those of other statistical modeling packages using and x/y design, as well as a lower-level interface that interacts more directly with the underlying core library and which is similar to those of other language bindings like Python, plus various helpers to interact with its model objects such as by plotting their feature importances or converting them to other formats. 
+
+The main function of interest is `xgboost(x, y, ...)`, which calls the XGBoost model building procedure on observed data of covariates/features/predictors "x", and a response variable "y" - it should feel familiar to users of packages like `glmnet` or `ncvreg`:
+
+```{r}
+library(xgboost)
+data(ToothGrowth)
+
+y <- ToothGrowth$supp # the response which we want to model/predict
+x <- ToothGrowth[, c("len", "dose")] # the features from which we want to predict it
+model <- xgboost(x, y, nthreads = 1, nrounds = 2)
+model
+```
+
+In this case, the "y" response variable that was supplied is a "factor" type with two classes ("OJ" and "VC") - hence, XGBoost builds a binary classification model for it based on the features "x", by finding a maximum likelihood estimate (similar to the `family="binomial"` model from R's `glm` function) through rule buckets obtained from the sum of two decision trees (from `nrounds=2`), from which we can then predict probabilities, log-odds, class with highest likelihood, among others:
+
+```{r}
+predict(model, x[1:6, ], type = "response") # probabilities for y's last level ("VC")
+predict(model, x[1:6, ], type = "raw") # log-odds
+predict(model, x[1:6, ], type = "class") # class with highest probability
+```
+
+Compared to R's `glm` function which follows the concepts of "families" and "links" from GLM theory to fit models for different kinds of response distributions, XGBoost follows the simpler concept of "objectives" which mix both of them into one, and which, just like `glm`, allow modeling very different kinds of response distributions (e.g. discrete choices, real-valued numbers, counts, censored measurements, etc.) through a common framework.
+
+XGBoost will automatically determine a suitable objective for the response given its object class (can pass factors for classification, numeric vectors for regression, `Surv` objects from the `survival` package for survival, etc. - see `?xgboost` for more details), but this can be controlled manually through an `objective` parameter based on the kind of model that is desired:
+
+```{r}
+data(mtcars)
+
+y <- mtcars$mpg
+x <- mtcars[, -1]
+model_gaussian <- xgboost(x, y, nthreads = 1, nrounds = 2) # default is squared loss (Gaussian)
+model_poisson <- xgboost(x, y, objective = "count:poisson", nthreads = 1, nrounds = 2)
+model_abserr <- xgboost(x, y, objective = "reg:absoluteerror", nthreads = 1, nrounds = 2)
+```
+
+_Note: the objective must match the type of the "y" response variable - for example, classification objectives for discrete choices require "factor" types, while regression models for real-valued data require "numeric" types._
+
+# Model parameters
+
+XGBoost models allow a large degree of control over how they are built. By their nature, gradient-boosted decision tree ensembles are able to capture very complex patterns between features in the data and a response variable, which also means they can suffer from overfitting if not controlled appropriately.
+
+For best results, one needs to find suitable parameters for the data being modeled. Note that XGBoost does not adjust its default hyperparameters based on the data, and different datasets will require vastly different hyperparameters for optimal predictive performance.
+
+For example, for a small dataset like "ToothGrowth" which has only two features and 60 observations, the defaults from XGBoost are overkill and lead to severe overfitting - for such data, one might want to have smaller trees (i.e.
 more conservative decision rules, capturing simpler patterns) and fewer of them, for example.
+
+Parameters can be controlled by passing additional arguments to `xgboost()`. See `?xgb.params` for details about what parameters are available to control.
+
+```{r}
+y <- ToothGrowth$supp
+x <- ToothGrowth[, c("len", "dose")]
+model_conservative <- xgboost(
+    x, y, nthreads = 1,
+    nrounds = 5,
+    max_depth = 2,
+    reg_lambda = 0.5,
+    learning_rate = 0.15
+)
+pred_conservative <- predict(
+    model_conservative,
+    x
+)
+pred_conservative[1:6] # probabilities are all closer to 0.5 now
+```
+
+XGBoost also allows calculating evaluation metrics for model quality over boosting rounds, with a wide variety of built-in metrics available to use. It's possible to automatically set aside a fraction of the data to use as an evaluation set, from which one can then visually monitor progress and overfitting:
+
+```{r}
+xgboost(
+    x, y, nthreads = 1,
+    eval_set = 0.2,
+    monitor_training = TRUE,
+    verbosity = 1,
+    eval_metric = c("auc", "logloss"),
+    nrounds = 5,
+    max_depth = 2,
+    reg_lambda = 0.5,
+    learning_rate = 0.15
+)
+```
+
+# Examining model objects
+
+XGBoost model objects for the most part consist of a pointer to a C++ object where most of the information is held and which is interfaced through the utility functions and methods in the package, but also contain some R attributes that can be retrieved (and new ones added) through `attributes()`:
+
+```{r}
+attributes(model)
+```
+
+In addition to R attributes (which can be arbitrary R objects), they may also keep some standardized C-level attributes that one can access and modify (but which can only be in JSON format):
+
+```{r}
+xgb.attributes(model)
+```
+
+(they are empty for this model)
+
+... but usually, when it comes to getting something out of a model object, one would typically want to do this through the built-in utility functions. Some examples:
+
+```{r}
+xgb.importance(model)
+```
+
+```{r}
+xgb.model.dt.tree(model)
+```
+
+# Other features
+
+XGBoost supports many additional features on top of its traditional gradient-boosting framework, including, among others:
+
+* Building decision tree models with characteristics such as per-feature monotonicity constraints or interaction constraints.
+* Calculating feature contributions in individual predictions.
+* Using custom objectives and custom evaluation metrics.
+* Fitting linear models.
+* Fitting models on GPUs and/or on data that doesn't fit in RAM ("external memory").
+
+See the [online documentation](https://xgboost.readthedocs.io/en/stable/index.html) - particularly the [tutorials section](https://xgboost.readthedocs.io/en/stable/tutorials/index.html) - for a glimpse over further functionalities that XGBoost offers.
+
+# The low-level interface
+
+In addition to the `xgboost(x, y, ...)` function, XGBoost also provides a lower-level interface for creating model objects through the function `xgb.train()`, which resembles the `xgb.train` functions in other language bindings of XGBoost.
+
+This `xgb.train()` interface exposes additional functionalities (such as user-supplied callbacks or external-memory data support) and performs fewer data validations and castings compared to the `xgboost()` function interface.
+
+Some key differences between the two interfaces:
+
+* Unlike `xgboost()` which takes R objects such as `matrix` or `data.frame` as inputs, the function `xgb.train()` uses XGBoost's own data container called "DMatrix", which can be created from R objects through the function `xgb.DMatrix()`. Note that there are other "DMatrix" constructors too, such as `xgb.QuantileDMatrix()`, which might be more beneficial for some use-cases.
+* A "DMatrix" object may contain a mixture of features/covariates, the response variable, observation weights, base margins, among others (a short sketch of attaching such fields is shown at the end of this section); and unlike `xgboost()`, it requires its inputs to have already been encoded into the representation that XGBoost uses behind the scenes - for example, while `xgboost()` may take a `factor` object as "y", `xgb.DMatrix()` requires instead a binary response variable to be passed as a vector of zeros and ones.
+* Hyperparameters are passed as function arguments in `xgboost()`, while they are passed as a named list to `xgb.train()`.
+* The `xgb.train()` interface keeps less metadata about its inputs - for example, it will not add levels of factors as column names to estimated probabilities when calling `predict`.
+
+Example usage of `xgb.train()`:
+
+```{r}
+data("agaricus.train")
+dmatrix <- xgb.DMatrix(
+    data = agaricus.train$data, # a sparse CSC matrix ('dgCMatrix')
+    label = agaricus.train$label # zeros and ones
+)
+booster <- xgb.train(
+    data = dmatrix,
+    nrounds = 10,
+    params = list(
+        objective = "binary:logistic",
+        nthread = 1,
+        max_depth = 3
+    )
+)
+
+data("agaricus.test")
+dmatrix_test <- xgb.DMatrix(agaricus.test$data)
+pred_prob <- predict(booster, dmatrix_test)
+pred_raw <- predict(booster, dmatrix_test, outputmargin = TRUE)
+```
+
+Model objects produced by `xgb.train()` have class `xgb.Booster`, while model objects produced by `xgboost()` have class `xgboost`, which is a subclass of `xgb.Booster`. Their `predict` methods also take different arguments - for example, `predict.xgboost` has a `type` parameter, while `predict.xgb.Booster` controls this through binary arguments - but as `xgboost` is a subclass of `xgb.Booster`, methods for `xgb.Booster` can be called on `xgboost` objects if needed.
+
+Utility functions in the XGBoost R package will work with both model classes - for example:
+
+```{r}
+xgb.importance(model)
+xgb.importance(booster)
+```
+
+While `xgboost()` aims to provide a user-friendly interface, there are still many situations where one should prefer the `xgb.train()` interface - for example:
+
+* For latency-sensitive applications (e.g. when serving models in real time), `xgb.train()` will have a speed advantage, as it performs fewer validations, conversions, and post-processing steps on metadata.
+* If you are developing an R package that depends on XGBoost, `xgb.train()` will provide a more stable interface (less subject to changes) and will have lower time/memory overhead.
+* If you need functionalities that are not exposed by the `xgboost()` interface - for example, if your dataset does not fit into the computer's RAM, it's still possible to construct a DMatrix from it if the data is loaded in batches through `xgb.ExtMemDMatrix()`.
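+
+As a closing illustration of these differences, below is a minimal sketch - reusing the "agaricus" data loaded above and assuming the `weight` argument of `xgb.DMatrix()` - of how per-observation fields such as weights are attached directly to the "DMatrix" rather than passed as separate arguments the way `xgboost()` takes them:
+
+```{r}
+# illustrative sketch: random observation weights attached as a DMatrix field
+row_weights <- runif(nrow(agaricus.train$data))
+dmatrix_weighted <- xgb.DMatrix(
+    data = agaricus.train$data,
+    label = agaricus.train$label,
+    weight = row_weights
+)
+booster_weighted <- xgb.train(
+    data = dmatrix_weighted,
+    nrounds = 5,
+    params = list(
+        objective = "binary:logistic",
+        nthread = 1,
+        max_depth = 3
+    )
+)
+```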
diff --git a/doc/R-package/discoverYourData.md b/doc/R-package/discoverYourData.md deleted file mode 100644 index 1c51364596b5..000000000000 --- a/doc/R-package/discoverYourData.md +++ /dev/null @@ -1,475 +0,0 @@ -# Understand your dataset with XGBoost - -## Introduction - -The purpose of this vignette is to show you how to use **XGBoost** to -discover and understand your own dataset better. - -This vignette is not about predicting anything (see [XGBoost -presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). -We will explain how to use **XGBoost** to highlight the *link* between -the *features* of your data and the *outcome*. - -Package loading: - - require(xgboost) - require(Matrix) - require(data.table) - if (!require('vcd')) { - install.packages('vcd') - } - -> **VCD** package is used for one of its embedded dataset only. - -## Preparation of the dataset - -### Numeric v.s. categorical variables - -**XGBoost** manages only `numeric` vectors. - -What to do when you have *categorical* data? - -A *categorical* variable has a fixed number of different values. For -instance, if a variable called *Colour* can have only one of these three -values, *red*, *blue* or *green*, then *Colour* is a *categorical* -variable. - -> In **R**, a *categorical* variable is called `factor`. -> -> Type `?factor` in the console for more information. - -To answer the question above we will convert *categorical* variables to -`numeric` ones. - -### Conversion from categorical to numeric variables - -#### Looking at the raw data - -+In this Vignette we will see how to transform a *dense* `data.frame` -(*dense* = the majority of the matrix is non-zero) with *categorical* -variables to a very *sparse* matrix (*sparse* = lots of zero entries in -the matrix) of `numeric` features. - -The method we are going to see is usually called [one-hot -encoding](https://en.wikipedia.org/wiki/One-hot). - -The first step is to load the `Arthritis` dataset in memory and wrap it -with the `data.table` package. - - data(Arthritis) - df <- data.table(Arthritis, keep.rownames = FALSE) - -> `data.table` is 100% compliant with **R** `data.frame` but its syntax -> is more consistent and its performance for large dataset is [best in -> class](https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) -> (`dplyr` from **R** and `Pandas` from **Python** -> [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). -> Some parts of **XGBoost’s** **R** package use `data.table`. - -The first thing we want to do is to have a look to the first few lines -of the `data.table`: - - head(df) - - ## ID Treatment Sex Age Improved - ## 1: 57 Treated Male 27 Some - ## 2: 46 Treated Male 29 None - ## 3: 77 Treated Male 30 None - ## 4: 17 Treated Male 32 Marked - ## 5: 36 Treated Male 46 Marked - ## 6: 23 Treated Male 58 Marked - -Now we will check the format of each column. - - str(df) - - ## Classes 'data.table' and 'data.frame': 84 obs. of 5 variables: - ## $ ID : int 57 46 77 17 36 23 75 39 33 55 ... - ## $ Treatment: Factor w/ 2 levels "Placebo","Treated": 2 2 2 2 2 2 2 2 2 2 ... - ## $ Sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ... - ## $ Age : int 27 29 30 32 46 58 59 59 63 63 ... - ## $ Improved : Ord.factor w/ 3 levels "None"<"Some"<..: 2 1 1 3 3 3 1 3 1 1 ... - ## - attr(*, ".internal.selfref")= - -2 columns have `factor` type, one has `ordinal` type. 
- -> `ordinal` variable : -> -> - can take a limited number of values (like `factor`) ; -> - these values are ordered (unlike `factor`). Here these ordered -> values are: `Marked > Some > None` - -#### Creation of new features based on old ones - -We will add some new *categorical* features to see if it helps. - -##### Grouping per 10 years - -For the first features we create groups of age by rounding the real age. - -Note that we transform it to `factor` so the algorithm treats these age -groups as independent values. - -Therefore, 20 is not closer to 30 than 60. In other words, the distance -between ages is lost in this transformation. - - head(df[, AgeDiscret := as.factor(round(Age / 10, 0))]) - - ## ID Treatment Sex Age Improved AgeDiscret - ## 1: 57 Treated Male 27 Some 3 - ## 2: 46 Treated Male 29 None 3 - ## 3: 77 Treated Male 30 None 3 - ## 4: 17 Treated Male 32 Marked 3 - ## 5: 36 Treated Male 46 Marked 5 - ## 6: 23 Treated Male 58 Marked 6 - -##### Randomly split into two groups - -The following is an even stronger simplification of the real age with an -arbitrary split at 30 years old. I choose this value **based on -nothing**. We will see later if simplifying the information based on -arbitrary values is a good strategy (you may already have an idea of how -well it will work…). - - head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]) - - ## ID Treatment Sex Age Improved AgeDiscret AgeCat - ## 1: 57 Treated Male 27 Some 3 Young - ## 2: 46 Treated Male 29 None 3 Young - ## 3: 77 Treated Male 30 None 3 Young - ## 4: 17 Treated Male 32 Marked 3 Old - ## 5: 36 Treated Male 46 Marked 5 Old - ## 6: 23 Treated Male 58 Marked 6 Old - -##### Risks in adding correlated features - -These new features are highly correlated to the `Age` feature because -they are simple transformations of this feature. - -For many machine learning algorithms, using correlated features is not a -good idea. It may sometimes make prediction less accurate, and most of -the time make interpretation of the model almost impossible. GLM, for -instance, assumes that the features are uncorrelated. - -Fortunately, decision tree algorithms (including boosted trees) are very -robust to these features. Therefore we don’t have to do anything to -manage this situation. - -##### Cleaning data - -We remove ID as there is nothing to learn from this feature (it would -just add some noise). - - df[, ID := NULL] - -We will list the different values for the column `Treatment`: - - levels(df[, Treatment]) - - ## [1] "Placebo" "Treated" - -#### Encoding categorical features - -Next step, we will transform the categorical data to dummy variables. -Several encoding methods exist, e.g., [one-hot -encoding](https://en.wikipedia.org/wiki/One-hot) is a common approach. -We will use the [dummy contrast -coding](https://stats.oarc.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/) -which is popular because it produces “full rank” encoding (also see -[this blog post by Max -Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)). - -The purpose is to transform each value of each *categorical* feature -into a *binary* feature `{0, 1}`. - -For example, the column `Treatment` will be replaced by two columns, -`TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be -*binary*. 
Therefore, an observation which has the value `Placebo` in -column `Treatment` before the transformation will have the value `1` in -the new column `TreatmentPlacebo` and the value `0` in the new column -`TreatmentTreated` after the transformation. The column -`TreatmentPlacebo` will disappear during the contrast encoding, as it -would be absorbed into a common constant intercept column. - -Column `Improved` is excluded because it will be our `label` column, the -one we want to predict. - - sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[, -1] - head(sparse_matrix) - - ## 6 x 9 sparse Matrix of class "dgCMatrix" - ## TreatmentTreated SexMale Age AgeDiscret3 AgeDiscret4 AgeDiscret5 AgeDiscret6 - ## 1 1 1 27 1 . . . - ## 2 1 1 29 1 . . . - ## 3 1 1 30 1 . . . - ## 4 1 1 32 1 . . . - ## 5 1 1 46 . . 1 . - ## 6 1 1 58 . . . 1 - ## AgeDiscret7 AgeCatYoung - ## 1 . 1 - ## 2 . 1 - ## 3 . 1 - ## 4 . . - ## 5 . . - ## 6 . . - -> Formula `Improved ~ .` used above means transform all *categorical* -> features but column `Improved` to binary values. The `-1` column -> selection removes the intercept column which is full of `1` (this -> column is generated by the conversion). For more information, you can -> type `?sparse.model.matrix` in the console. - -Create the output `numeric` vector (not as a sparse `Matrix`): - - output_vector <- df[, Improved] == "Marked" - -1. set `Y` vector to `0`; -2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ; -3. return `Y` vector. - -## Build the model - -The code below is very usual. For more information, you can look at the -documentation of `xgboost` function (or at the vignette [XGBoost -presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). - - bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4, - eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic") - - ## [1] train-logloss:0.485466 - ## [2] train-logloss:0.438534 - ## [3] train-logloss:0.412250 - ## [4] train-logloss:0.395828 - ## [5] train-logloss:0.384264 - ## [6] train-logloss:0.374028 - ## [7] train-logloss:0.365005 - ## [8] train-logloss:0.351233 - ## [9] train-logloss:0.341678 - ## [10] train-logloss:0.334465 - -You can see some `train-logloss: 0.XXXXX` lines followed by a number. It -decreases. Each line shows how well the model explains the data. Lower -is better. - -A small value for training error may be a symptom of -[overfitting](https://en.wikipedia.org/wiki/Overfitting), meaning the -model will not accurately predict unseen values. - -## Feature importance - -## Measure feature importance - -### Build the feature importance data.table - -Remember, each binary column corresponds to a single value of one of -*categorical* features. - - importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst) - head(importance) - - ## Feature Gain Cover Frequency - ## 1: Age 0.622031769 0.67251696 0.67241379 - ## 2: TreatmentTreated 0.285750540 0.11916651 0.10344828 - ## 3: SexMale 0.048744022 0.04522028 0.08620690 - ## 4: AgeDiscret6 0.016604639 0.04784639 0.05172414 - ## 5: AgeDiscret3 0.016373781 0.08028951 0.05172414 - ## 6: AgeDiscret4 0.009270557 0.02858801 0.01724138 - -> The column `Gain` provides the information we are looking for. -> -> As you can see, features are classified by `Gain`. - -`Gain` is the improvement in accuracy brought by a feature to the -branches it is on. 
The idea is that before adding a new split on a -feature X to the branch there were some wrongly classified elements; -after adding the split on this feature, there are two new branches, and -each of these branches is more accurate (one branch saying if your -observation is on this branch then it should be classified as `1`, and -the other branch saying the exact opposite). - -`Cover` is related to the second order derivative (or Hessian) of the -loss function with respect to a particular variable; thus, a large value -indicates a variable has a large potential impact on the loss function -and so is important. - -`Frequency` is a simpler way to measure the `Gain`. It just counts the -number of times a feature is used in all generated trees. You should not -use it (unless you know why you want to use it). - -### Plotting the feature importance - -All these things are nice, but it would be even better to plot the -results. - - xgb.plot.importance(importance_matrix = importance) - - - -Running this line of code, you should get a bar chart showing the -importance of the 6 features (containing the same data as the output we -saw earlier, but displaying it visually for easier consumption). Note -that `xgb.ggplot.importance` is also available for all the ggplot2 fans! - -> Depending of the dataset and the learning parameters you may have more -> than two clusters. Default value is to limit them to `10`, but you can -> increase this limit. Look at the function documentation for more -> information. - -According to the plot above, the most important features in this dataset -to predict if the treatment will work are : - -- An individual’s age; -- Having received a placebo or not; -- Gender; -- Our generated feature AgeDiscret. We can see that its contribution - is very low. - -### Do these results make sense? - -Let’s check some **Chi2** between each of these features and the label. - -Higher **Chi2** means better correlation. - - c2 <- chisq.test(df$Age, output_vector) - print(c2) - - ## - ## Pearson's Chi-squared test - ## - ## data: df$Age and output_vector - ## X-squared = 35.475, df = 35, p-value = 0.4458 - -The Pearson correlation between Age and illness disappearing is -**35.47**. - - c2 <- chisq.test(df$AgeDiscret, output_vector) - print(c2) - - ## - ## Pearson's Chi-squared test - ## - ## data: df$AgeDiscret and output_vector - ## X-squared = 8.2554, df = 5, p-value = 0.1427 - -Our first simplification of Age gives a Pearson correlation of **8.26**. - - c2 <- chisq.test(df$AgeCat, output_vector) - print(c2) - - ## - ## Pearson's Chi-squared test with Yates' continuity correction - ## - ## data: df$AgeCat and output_vector - ## X-squared = 2.3571, df = 1, p-value = 0.1247 - -The perfectly random split we did between young and old at 30 years old -has a low correlation of **2.36**. This suggests that, for the -particular illness we are studying, the age at which someone is -vulnerable to this disease is likely very different from 30. - -Moral of the story: don’t let your *gut* lower the quality of your -model. - -In *data science*, there is the word *science* :-) - -## Conclusion - -As you can see, in general *destroying information by simplifying it -won’t improve your model*. **Chi2** just demonstrates that. - -But in more complex cases, creating a new feature from an existing one -may help the algorithm and improve the model. - -+The case studied here is not complex enough to show that. Check [Kaggle -website](https://www.kaggle.com/) for some challenging datasets. 
- -Moreover, you can see that even if we have added some new features which -are not very useful/highly correlated with other features, the boosting -tree algorithm was still able to choose the best one (which in this case -is the Age). - -Linear models may not perform as well. - -## Special Note: What about Random Forests™? - -As you may know, the [Random -Forests](https://en.wikipedia.org/wiki/Random_forest) algorithm is -cousin with boosting and both are part of the [ensemble -learning](https://en.wikipedia.org/wiki/Ensemble_learning) family. - -Both train several decision trees for one dataset. The *main* difference -is that in Random Forests, trees are independent and in boosting, the -`N+1`-st tree focuses its learning on the loss (<=> what has not -been well modeled by the tree `N`). - -This difference can have an impact on a edge case in feature importance -analysis: *correlated features*. - -Imagine two features perfectly correlated, feature `A` and feature `B`. -For one specific tree, if the algorithm needs one of them, it will -choose randomly (true in both boosting and Random Forests). - -However, in Random Forests this random choice will be done for each -tree, because each tree is independent from the others. Therefore, -approximately (and depending on your parameters) 50% of the trees will -choose feature `A` and the other 50% will choose feature `B`. So the -*importance* of the information contained in `A` and `B` (which is the -same, because they are perfectly correlated) is diluted in `A` and `B`. -So you won’t easily know this information is important to predict what -you want to predict! It is even worse when you have 10 correlated -features… - -In boosting, when a specific link between feature and outcome have been -learned by the algorithm, it will try to not refocus on it (in theory it -is what happens, reality is not always that simple). Therefore, all the -importance will be on feature `A` or on feature `B` (but not both). You -will know that one feature has an important role in the link between the -observations and the label. It is still up to you to search for the -correlated features to the one detected as important if you need to know -all of them. - -If you want to try Random Forests algorithm, you can tweak XGBoost -parameters! - -For instance, to compute a model with 1000 trees, with a 0.5 factor on -sampling rows and columns: - - data(agaricus.train, package = 'xgboost') - data(agaricus.test, package = 'xgboost') - train <- agaricus.train - test <- agaricus.test - - #Random Forest - 1000 trees - bst <- xgboost( - data = train$data - , label = train$label - , max_depth = 4 - , num_parallel_tree = 1000 - , subsample = 0.5 - , colsample_bytree = 0.5 - , nrounds = 1 - , objective = "binary:logistic" - ) - - ## [1] train-logloss:0.456201 - - #Boosting - 3 rounds - bst <- xgboost( - data = train$data - , label = train$label - , max_depth = 4 - , nrounds = 3 - , objective = "binary:logistic" - ) - - ## [1] train-logloss:0.444882 - ## [2] train-logloss:0.302428 - ## [3] train-logloss:0.212847 - -> Note that the parameter `round` is set to `1`. - -> [**Random -> Forests**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) -> is a trademark of Leo Breiman and Adele Cutler and is licensed -> exclusively to Salford Systems for the commercial release of the -> software. 
diff --git a/doc/R-package/index.rst b/doc/R-package/index.rst index 18de8d1c0902..41f4056e5989 100644 --- a/doc/R-package/index.rst +++ b/doc/R-package/index.rst @@ -23,8 +23,7 @@ Tutorials :maxdepth: 2 :titlesonly: - Introduction to XGBoost in R - Understanding your dataset with XGBoost + R Introductory Vignette ************ Other topics diff --git a/doc/R-package/xgboostPresentation.md b/doc/R-package/xgboostPresentation.md deleted file mode 100644 index 9fe4787eb80c..000000000000 --- a/doc/R-package/xgboostPresentation.md +++ /dev/null @@ -1,589 +0,0 @@ - -XGBoost R Tutorial -================== - -## Introduction - - -**XGBoost** is short for e**X**treme **G**radient **Boost**ing package. - -The purpose of this Vignette is to show you how to use **XGBoost** to build a model and make predictions. - -It is an efficient and scalable implementation of gradient boosting framework by @friedman2000additive and @friedman2001greedy. Two solvers are included: - -- *linear* model ; -- *tree learning* algorithm. - -It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily. - -It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions. - -It has several features: - -* Speed: it can automatically do parallel computation on *Windows* and *Linux*, with *OpenMP*. It is generally over 10 times faster than the classical `gbm`. -* Input Type: it takes several types of input data: - * *Dense* Matrix: *R*'s *dense* matrix, i.e. `matrix` ; - * *Sparse* Matrix: *R*'s *sparse* matrix, i.e. `Matrix::dgCMatrix` ; - * Data File: local data files ; - * `xgb.DMatrix`: its own class (recommended). -* Sparsity: it accepts *sparse* input for both *tree booster* and *linear booster*, and is optimized for *sparse* input ; -* Customization: it supports customized objective functions and evaluation functions. - -## Installation - - -### GitHub version - - -For weekly updated version (highly recommended), install from *GitHub*: - - -```r -install.packages("drat", repos="https://cran.rstudio.com") -drat:::addRepo("dmlc") -install.packages("xgboost", repos="http://dmlc.ml/drat/", type = "source") -``` - -> *Windows* users will need to install [Rtools](http://cran.r-project.org/bin/windows/Rtools/) first. - -### CRAN version - - -The version 0.4-2 is on CRAN, and you can install it by: - - -```r -install.packages("xgboost") -``` - -Formerly available versions can be obtained from the CRAN [archive](http://cran.r-project.org/src/contrib/Archive/xgboost) - -## Learning - - -For the purpose of this tutorial we will load **XGBoost** package. - - -```r -require(xgboost) -``` - -### Dataset presentation - - -In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the same as you will use on in your every day life :-). - -Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013. - -### Dataset loading - - -We will load the `agaricus` datasets embedded with the package and will link them to variables. - -The datasets are already split in: - -* `train`: will be used to build the model ; -* `test`: will be used to assess the quality of our model. - -Why *split* the dataset in two parts? - -In the first part we will build our model. In the second part we will want to test it and assess its quality. 
Without dividing the dataset we would test the model on the data which the algorithm have already seen. - - -```r -data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') -train <- agaricus.train -test <- agaricus.test -``` - -> In the real world, it would be up to you to make this division between `train` and `test` data. The way to do it is out of scope for this article, however `caret` package may [help](http://topepo.github.io/caret/data-splitting.html). - -Each variable is a `list` containing two things, `label` and `data`: - - -```r -str(train) -``` - -``` -## List of 2 -## $ data :Formal class 'dgCMatrix' [package "Matrix"] with 6 slots -## .. ..@ i : int [1:143286] 2 6 8 11 18 20 21 24 28 32 ... -## .. ..@ p : int [1:127] 0 369 372 3306 5845 6489 6513 8380 8384 10991 ... -## .. ..@ Dim : int [1:2] 6513 126 -## .. ..@ Dimnames:List of 2 -## .. .. ..$ : NULL -## .. .. ..$ : chr [1:126] "cap-shape=bell" "cap-shape=conical" "cap-shape=convex" "cap-shape=flat" ... -## .. ..@ x : num [1:143286] 1 1 1 1 1 1 1 1 1 1 ... -## .. ..@ factors : list() -## $ label: num [1:6513] 1 0 0 1 0 0 0 1 0 0 ... -``` - -`label` is the outcome of our dataset meaning it is the binary *classification* we will try to predict. - -Let's discover the dimensionality of our datasets. - - -```r -dim(train$data) -``` - -``` -## [1] 6513 126 -``` - -```r -dim(test$data) -``` - -``` -## [1] 1611 126 -``` - -This dataset is very small to not make the **R** package too heavy, however **XGBoost** is built to manage huge datasets very efficiently. - -As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`): - - -```r -class(train$data)[1] -``` - -``` -## [1] "dgCMatrix" -``` - -```r -class(train$label) -``` - -``` -## [1] "numeric" -``` - -### Basic Training using XGBoost - - -This step is the most critical part of the process for the quality of our model. - -#### Basic training - -We are using the `train` data. As explained above, both `data` and `label` are stored in a `list`. - -In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore, in a dataset mainly made of `0`, memory size is reduced. It is very common to have such a dataset. - -We will train decision tree model using the following parameters: - -* `objective = "binary:logistic"`: we will train a binary classification model ; -* `max.depth = 2`: the trees won't be deep, because our case is very simple ; -* `nthread = 2`: the number of CPU threads we are going to use; -* `nrounds = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction. - - -```r -bstSparse <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") -``` - -``` -## [0] train-error:0.046522 -## [1] train-error:0.022263 -``` - -> The more complex the relationship between your features and your `label` is, the more passes you need. - -#### Parameter variations - -##### Dense matrix - -Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** matrix. - - -```r -bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") -``` - -``` -## [0] train-error:0.046522 -## [1] train-error:0.022263 -``` - -##### xgb.DMatrix - -**XGBoost** offers a way to group them in a `xgb.DMatrix`. 
You can even add other meta data in it. This will be useful for the most advanced features we will discover later. - - -```r -dtrain <- xgb.DMatrix(data = train$data, label = train$label) -bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") -``` - -``` -## [0] train-error:0.046522 -## [1] train-error:0.022263 -``` - -##### Verbose option - -**XGBoost** has several features to help you view the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality. - -One of the simplest way to see the training progress is to set the `verbose` option (see below for more advanced techniques). - - -```r -# verbose = 0, no message -bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 0) -``` - - -```r -# verbose = 1, print evaluation metric -bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 1) -``` - -``` -## [0] train-error:0.046522 -## [1] train-error:0.022263 -``` - - -```r -# verbose = 2, also print information about tree -bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic", verbose = 2) -``` - -``` -## [11:41:01] amalgamation/../src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=2 -## [0] train-error:0.046522 -## [11:41:01] amalgamation/../src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 4 extra nodes, 0 pruned nodes, max_depth=2 -## [1] train-error:0.022263 -``` - -## Basic prediction using XGBoost - - -## Perform the prediction - - -The purpose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step. - - -```r -pred <- predict(bst, test$data) - -# size of the prediction vector -print(length(pred)) -``` - -``` -## [1] 1611 -``` - -```r -# limit display of predictions to the first 10 -print(head(pred)) -``` - -``` -## [1] 0.28583017 0.92392391 0.28583017 0.28583017 0.05169873 0.92392391 -``` - -These numbers doesn't look like *binary classification* `{0,1}`. We need to perform a simple transformation before being able to use these results. - -## Transform the regression in a binary classification - - -The only thing that **XGBoost** does is a *regression*. **XGBoost** is using `label` vector to build its *regression* model. - -How can we use a *regression* model to perform a binary classification? - -If we think about the meaning of a regression applied to our data, the numbers we get are probabilities that a datum will be classified as `1`. Therefore, we will set the rule that if this probability for a specific datum is `> 0.5` then the observation is classified as `1` (or `0` otherwise). - - -```r -prediction <- as.numeric(pred > 0.5) -print(head(prediction)) -``` - -``` -## [1] 0 1 0 0 0 1 -``` - -## Measuring model performance - - -To measure the model performance, we will compute a simple metric, the *average error*. - - -```r -err <- mean(as.numeric(pred > 0.5) != test$label) -print(paste("test-error=", err)) -``` - -``` -## [1] "test-error= 0.0217256362507759" -``` - -> Note that the algorithm has not seen the `test` data during the model construction. - -Steps explanation: - -1. `as.numeric(pred > 0.5)` applies our rule that when the probability (<=> regression <=> prediction) is `> 0.5` the observation is classified as `1` and `0` otherwise ; -2. 
`probabilityVectorPreviouslyComputed != test$label` computes the vector of error between true data and computed probabilities ; -3. `mean(vectorOfErrors)` computes the *average error* itself. - -The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**. - -*Multiclass* classification works in a similar way. - -This metric is **0.02** and is pretty low: our yummly mushroom model works well! - -## Advanced features - - -Most of the features below have been implemented to help you to improve your model by offering a better understanding of its content. - - -### Dataset preparation - - -For the following advanced features, we need to put data in `xgb.DMatrix` as explained above. - - -```r -dtrain <- xgb.DMatrix(data = train$data, label=train$label) -dtest <- xgb.DMatrix(data = test$data, label=test$label) -``` - -### Measure learning progress with xgb.train - - -Both `xgboost` (simple) and `xgb.train` (advanced) functions train models. - -One of the special features of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds lead to overfitting. You can see this feature as a cousin of a cross-validation method. The following techniques will help you to avoid overfitting or optimizing the learning time in stopping it as soon as possible. - -One way to measure progress in the learning of a model is to provide to **XGBoost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning. - -> in some way it is similar to what we have done above with the average error. The main difference is that above it was after building the model, and now it is during the construction that we measure errors. - -For the purpose of this example, we use `watchlist` parameter. It is a list of `xgb.DMatrix`, each of them tagged with a name. - - -```r -watchlist <- list(train=dtrain, test=dtest) - -bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, objective = "binary:logistic") -``` - -``` -## [0] train-error:0.046522 test-error:0.042831 -## [1] train-error:0.022263 test-error:0.021726 -``` - -**XGBoost** has computed at each round the same average error metric seen above (we set `nrounds` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset. - -Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset. - -If with your own dataset you do not have such results, you should think about how you divided your dataset in training and test. May be there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/data-splitting.html). - -For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics. 
- - -```r -bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic") -``` - -``` -## [0] train-error:0.046522 train-logloss:0.233376 test-error:0.042831 test-logloss:0.226686 -## [1] train-error:0.022263 train-logloss:0.136658 test-error:0.021726 test-logloss:0.137874 -``` - -> `eval.metric` allows us to monitor two new metrics for each round, `logloss` and `error`. - -### Linear boosting - - -Until now, all the learnings we have performed were based on boosting trees. **XGBoost** implements a second algorithm, based on linear boosting. The only difference with the previous command is `booster = "gblinear"` parameter (and removing `eta` parameter). - - -```r -bst <- xgb.train(data=dtrain, booster = "gblinear", nthread = 2, nrounds=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic") -``` - -``` -## [0] train-error:0.024720 train-logloss:0.184616 test-error:0.022967 test-logloss:0.184234 -## [1] train-error:0.004146 train-logloss:0.069885 test-error:0.003724 test-logloss:0.068081 -``` - -In this specific case, *linear boosting* gets slightly better performance metrics than a decision tree based algorithm. - -In simple cases, this will happen because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better to catch a non linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use. - -### Manipulating xgb.DMatrix - - -#### Save / Load - -Like saving models, `xgb.DMatrix` object (which groups both dataset and outcome) can also be saved using `xgb.DMatrix.save` function. - - -```r -xgb.DMatrix.save(dtrain, "dtrain.buffer") -``` - -``` -## [1] TRUE -``` - -```r -# to load it in, simply call xgb.DMatrix -dtrain2 <- xgb.DMatrix("dtrain.buffer") -``` - -``` -## [11:41:01] 6513x126 matrix with 143286 entries loaded from dtrain.buffer -``` - -```r -bst <- xgb.train(data=dtrain2, max.depth=2, eta=1, nthread = 2, nrounds=2, watchlist=watchlist, objective = "binary:logistic") -``` - -``` -## [0] train-error:0.046522 test-error:0.042831 -## [1] train-error:0.022263 test-error:0.021726 -``` - - - -#### Information extraction - -Information can be extracted from an `xgb.DMatrix` using `getinfo` function. Hereafter we will extract `label` data. - - -```r -label = getinfo(dtest, "label") -pred <- predict(bst, dtest) -err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label) -print(paste("test-error=", err)) -``` - -``` -## [1] "test-error= 0.0217256362507759" -``` - -### View feature importance/influence from the learnt model - - -Feature importance is similar to R gbm package's relative influence (rel.inf). - -``` -importance_matrix <- xgb.importance(model = bst) -print(importance_matrix) -xgb.plot.importance(importance_matrix = importance_matrix) -``` - -#### View the trees from a model - - -You can dump the tree you learned using `xgb.dump` into a text file. 
- - -```r -xgb.dump(bst, with_stats = TRUE) -``` - -``` -## [1] "booster[0]" -## [2] "0:[f28<-1.00136e-05] yes=1,no=2,missing=1,gain=4000.53,cover=1628.25" -## [3] "1:[f55<-1.00136e-05] yes=3,no=4,missing=3,gain=1158.21,cover=924.5" -## [4] "3:leaf=1.71218,cover=812" -## [5] "4:leaf=-1.70044,cover=112.5" -## [6] "2:[f108<-1.00136e-05] yes=5,no=6,missing=5,gain=198.174,cover=703.75" -## [7] "5:leaf=-1.94071,cover=690.5" -## [8] "6:leaf=1.85965,cover=13.25" -## [9] "booster[1]" -## [10] "0:[f59<-1.00136e-05] yes=1,no=2,missing=1,gain=832.545,cover=788.852" -## [11] "1:[f28<-1.00136e-05] yes=3,no=4,missing=3,gain=569.725,cover=768.39" -## [12] "3:leaf=0.784718,cover=458.937" -## [13] "4:leaf=-0.96853,cover=309.453" -## [14] "2:leaf=-6.23624,cover=20.4624" -``` - -You can plot the trees from your model using ```xgb.plot.tree`` - -``` -xgb.plot.tree(model = bst) -``` - -> if you provide a path to `fname` parameter you can save the trees to your hard drive. - -#### Save and load models - - -Maybe your dataset is big, and it takes time to train a model on it? May be you are not a big fan of losing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required. - -Helpfully for you, **XGBoost** implements such functions. - - -```r -# save model to binary local file -xgb.save(bst, "xgboost.model") -``` - -``` -## [1] TRUE -``` - -> `xgb.save` function should return TRUE if everything goes well and crashes otherwise. - -An interesting test to see how identical our saved model is to the original one would be to compare the two predictions. - - -```r -# load binary model to R -bst2 <- xgb.load("xgboost.model") -pred2 <- predict(bst2, test$data) - -# And now the test -print(paste("sum(abs(pred2-pred))=", sum(abs(pred2-pred)))) -``` - -``` -## [1] "sum(abs(pred2-pred))= 0" -``` - - - -> result is `0`? We are good! - -In some very specific cases, like when you want to pilot **XGBoost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it. - - -```r -# save model to R's raw vector -rawVec <- xgb.save.raw(bst) - -# print class -print(class(rawVec)) -``` - -``` -## [1] "raw" -``` - -```r -# load binary model to R -bst3 <- xgb.load(rawVec) -pred3 <- predict(bst3, test$data) - -# pred3 should be identical to pred -print(paste("sum(abs(pred3-pred))=", sum(abs(pred3-pred)))) -``` - -``` -## [1] "sum(abs(pred3-pred))= 0" -``` - -> Again `0`? It seems that `XGBoost` works pretty well! - -## References diff --git a/doc/R-package/xgboost_introduction.md b/doc/R-package/xgboost_introduction.md new file mode 100644 index 000000000000..2c74284da355 --- /dev/null +++ b/doc/R-package/xgboost_introduction.md @@ -0,0 +1,1012 @@ +# XGBoost for R introduction + + +# Introduction + +**XGBoost** is an optimized distributed gradient boosting library +designed to be highly **efficient**, **flexible** and **portable**. It +implements machine learning algorithms under the [Gradient +Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework. +XGBoost provides a parallel tree boosting (also known as GBDT, GBM) that +solve many data science problems in a fast and accurate way. The same +code runs on major distributed environment (Hadoop, SGE, MPI) and can +solve problems beyond billions of examples. + +For an introduction to the concept of gradient boosting, see the +tutorial [Introduction to Boosted +Trees](https://xgboost.readthedocs.io/en/stable/tutorials/model.html) in +XGBoost’s online docs. 
+ +For more details about XGBoost’s features and usage, see the [online +documentation](https://xgboost.readthedocs.io/en/stable/) which contains +more tutorials, examples, and details. + +This short vignette outlines the basic usage of the R interface for +XGBoost, assuming the reader has some familiarity with the underlying +concepts behind statistical modeling with gradient-boosted decision +trees. + +# Building a predictive model + +At its core, XGBoost consists of a C++ library which offers bindings for +different programming languages, including R. The R package for XGBoost +provides an idiomatic interface similar to those of other statistical +modeling packages using and x/y design, as well as a lower-level +interface that interacts more directly with the underlying core library +and which is similar to those of other language bindings like Python, +plus various helpers to interact with its model objects such as by +plotting their feature importances or converting them to other formats. + +The main function of interest is `xgboost(x, y, ...)`, which calls the +XGBoost model building procedure on observed data of +covariates/features/predictors “x”, and a response variable “y” - it +should feel familiar to users of packages like `glmnet` or `ncvreg`: + +``` r +library(xgboost) +data(ToothGrowth) + +y <- ToothGrowth$supp # the response which we want to model/predict +x <- ToothGrowth[, c("len", "dose")] # the features from which we want to predct it +model <- xgboost(x, y, nthreads = 1, nrounds = 2) +model +``` + + XGBoost model object + Call: + xgboost(x = x, y = y, nrounds = 2, nthreads = 1) + Objective: binary:logistic + Number of iterations: 2 + Number of features: 2 + Classes: OJ, VC + XGBoost model object + Call: + xgboost(x = x, y = y, nrounds = 2, nthreads = 1) + Objective: binary:logistic + Number of iterations: 2 + Number of features: 2 + Classes: OJ, VC + +In this case, the “y” response variable that was supplied is a “factor” +type with two classes (“OJ” and “VC”) - hence, XGBoost builds a binary +classification model for it based on the features “x”, by finding a +maximum likelihood estimate (similar to the `faimily="binomial"` model +from R’s `glm` function) through rule buckets obtained from the sum of +two decision trees (from `nrounds=2`), from which we can then predict +probabilities, log-odds, class with highest likelihood, among others: + +``` r +predict(model, x[1:6, ], type = "response") # probabilities for y's last level ("VC") +predict(model, x[1:6, ], type = "raw") # log-odds +predict(model, x[1:6, ], type = "class") # class with highest probability +``` + +1 +0.6596265435218812 + +0.5402158498764043 + +0.6596265435218814 + +0.6596265435218815 + +0.6596265435218816 + +0.495350033044815 + + + +1 +0.6616302728652952 + +0.1612115055322653 + +0.6616302728652954 + +0.6616302728652955 + +0.6616302728652956 + +-0.0186003148555756 + +1. VC +2. VC +3. VC +4. VC +5. VC +6. OJ + +**Levels**: 1. ‘OJ’ 2. ‘VC’ + +Compared to R’s `glm` function which follows the concepts of “families” +and “links” from GLM theory to fit models for different kinds of +response distributions, XGBoost follows the simpler concept of +“objectives” which mix both of them into one, and which just like `glm`, +allow modeling very different kinds of response distributions +(e.g. discrete choices, real-valued numbers, counts, censored +measurements, etc.) through a common framework. 
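+
+As a minimal illustrative sketch (reusing the `ToothGrowth` objects
+created above), the same binary outcome can be framed both ways; the
+`objective` string is the one reported for this model earlier in the
+vignette:
+
+``` r
+# GLM view: family and link are chosen separately
+fit_glm <- glm(I(supp == "VC") ~ len + dose, data = ToothGrowth, family = binomial())
+
+# XGBoost view: a single 'objective' plays both roles
+fit_xgb <- xgboost(x, y, objective = "binary:logistic", nthreads = 1, nrounds = 2)
+```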
+ +XGBoost will automatically determine a suitable objective for the +response given its object class (can pass factors for classification, +numeric vectors for regression, `Surv` objects from the `survival` +package for survival, etc. - see `?xgboost` for more details), but this +can be controlled manually through an `objective` parameter based the +kind of model that is desired: + +``` r +data(mtcars) + +y <- mtcars$mpg +x <- mtcars[, -1] +model_gaussian <- xgboost(x, y, nthreads = 1, nrounds = 2) # default is squared loss (Gaussian) +model_poisson <- xgboost(x, y, objective = "count:poisson", nthreads = 1, nrounds = 2) +model_abserr <- xgboost(x, y, objective = "reg:absoluteerror", nthreads = 1, nrounds = 2) +``` + +*Note: the objective must match with the type of the “y” response +variable - for example, classification objectives for discrete choices +require “factor” types, while regression models for real-valued data +require “numeric” types.* + +# Model parameters + +XGBoost models allow a large degree of control over how they are built. +By their nature, gradient-boosted decision tree ensembles are able to +capture very complex patterns between features in the data and a +response variable, which also means they can suffer from overfitting if +not controlled appropirately. + +For best results, one needs to find suitable parameters for the data +being modeled. Note that XGBoost does not adjust its default +hyperparameters based on the data, and different datasets will require +vastly different hyperparameters for optimal predictive performance. + +For example, for a small dataset like “TootGrowth” which has only two +features and 60 observations, the defaults from XGBoost are an overkill +which lead to severe overfitting - for such data, one might want to have +smaller trees (i.e. more convervative decision rules, capturing simpler +patterns) and fewer of them, for example. + +Parameters can be controlled by passing additional arguments to +`xgboost()`. See `?xgb.params` for details about what parameters are +available to control. + +``` r +y <- ToothGrowth$supp +x <- ToothGrowth[, c("len", "dose")] +model_conservative <- xgboost( + x, y, nthreads = 1, + nrounds = 5, + max_depth = 2, + reg_lambda = 0.5, + learning_rate = 0.15 +) +pred_conservative <- predict( + model_conservative, + x +) +pred_conservative[1:6] # probabilities are all closer to 0.5 now +``` + +1 +0.6509257555007932 + +0.4822041690349583 + +0.6509257555007934 + +0.6509257555007935 + +0.6509257555007936 + +0.447792500257492 + +XGBoost also allows the possibility of calculating evaluation metrics +for model quality over boosting rounds, with a wide variety of built-in +metrics available to use. 
It’s possible to automatically set aside a +fraction of the data to use as evaluation set, from which one can then +visually monitor progress and overfitting: + +``` r +xgboost( + x, y, nthreads = 1, + eval_set = 0.2, + monitor_training = TRUE, + verbosity = 1, + eval_metric = c("auc", "logloss"), + nrounds = 5, + max_depth = 2, + reg_lambda = 0.5, + learning_rate = 0.15 +) +``` + + [1] train-auc:0.755556 train-logloss:0.663745 + [2] train-auc:0.785556 train-logloss:0.647036 + [3] train-auc:0.792778 train-logloss:0.633082 + [4] train-auc:0.792778 train-logloss:0.612353 + [5] train-auc:0.837778 train-logloss:0.601307 + + XGBoost model object + Call: + xgboost(x = x, y = y, nrounds = 5, verbosity = 1, monitor_training = TRUE, + nthreads = 1, eval_set = 0.2, eval_metric = c("auc", "logloss"), + max_depth = 2, reg_lambda = 0.5, learning_rate = 0.15) + Objective: binary:logistic + Number of iterations: 5 + Number of features: 2 + Classes: OJ, VC + XGBoost model object + Call: + xgboost(x = x, y = y, nrounds = 5, verbosity = 1, monitor_training = TRUE, + nthreads = 1, eval_set = 0.2, eval_metric = c("auc", "logloss"), + max_depth = 2, reg_lambda = 0.5, learning_rate = 0.15) + Objective: binary:logistic + Number of iterations: 5 + Number of features: 2 + Classes: OJ, VC + +# Examining model objects + +XGBoost model objects for the most part consist of a pointer to a C++ +object where most of the information is held and which is interfaced +through the utility functions and methods in the package, but also +contains some R attributes that can be retrieved (and new ones added) +through `attributes()`: + +``` r +attributes(model) +``` + + $call + xgboost(x = x, y = y, nrounds = 2, nthreads = 1) + + $params + $params$objective + [1] "binary:logistic" + + $params$nthread + [1] 1 + + $params$seed + [1] 0 + + $params$validate_parameters + [1] TRUE + + + $names + [1] "ptr" + + $class + [1] "xgboost" "xgb.Booster" + + $metadata + $metadata$y_levels + [1] "OJ" "VC" + + $metadata$n_targets + [1] 1 + +In addition to R attributes (which can be arbitrary R objects), it may +also keep some standardized C-level attributes that one can access and +modify (but which can only be JSON-format): + +``` r +xgb.attributes(model) +``` + +(they are empty for this model) + +… but usually, when it comes to getting something out of a model object, +one would typically want to do this through the built-in utility +functions. Some examples: + +``` r +xgb.importance(model) +``` + +A data.table: 2 × 4 + + ++++++ + + + + + + + + + + + + + + + + + + + + + + +
+| Feature <chr> | Gain <dbl> | Cover <dbl> | Frequency <dbl> |
+|---|---|---|---|
+| len | 0.7444265 | 0.6830449 | 0.7333333 |
+| dose | 0.2555735 | 0.3169551 | 0.2666667 |
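+
+The importance table can also be plotted; a minimal sketch using the
+package's plotting helper (a `ggplot2`-based `xgb.ggplot.importance()`
+is available as well):
+
+``` r
+# bar chart of the relative importance values computed above
+imp <- xgb.importance(model)
+xgb.plot.importance(imp)
+```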
+
+``` r
+xgb.model.dt.tree(model)
+```
+
+A data.table: 32 × 10
+
+| Tree <int> | Node <int> | ID <chr> | Feature <chr> | Split <dbl> | Yes <chr> | No <chr> | Missing <chr> | Gain <dbl> | Cover <dbl> |
+|---|---|---|---|---|---|---|---|---|---|
+| 0 | 0 | 0-0 | len | 19.7 | 0-1 | 0-2 | 0-2 | 5.88235283 | 15.000000 |
+| 0 | 1 | 0-1 | dose | 1.0 | 0-3 | 0-4 | 0-4 | 2.50230217 | 7.500000 |
+| 0 | 2 | 0-2 | dose | 2.0 | 0-5 | 0-6 | 0-6 | 2.50230217 | 7.500000 |
+| 0 | 3 | 0-3 | len | 8.2 | 0-7 | 0-8 | 0-8 | 5.02710962 | 4.750000 |
+| 0 | 4 | 0-4 | Leaf | NA | NA | NA | NA | 0.36000001 | 2.750000 |
+| 0 | 5 | 0-5 | Leaf | NA | NA | NA | NA | -0.36000001 | 2.750000 |
+| 0 | 6 | 0-6 | len | 29.5 | 0-9 | 0-10 | 0-10 | 0.93020594 | 4.750000 |
+| 0 | 7 | 0-7 | Leaf | NA | NA | NA | NA | 0.36000001 | 1.500000 |
+| 0 | 8 | 0-8 | len | 10.0 | 0-11 | 0-12 | 0-12 | 0.60633492 | 3.250000 |
+| 0 | 9 | 0-9 | len | 24.5 | 0-13 | 0-14 | 0-14 | 0.78028417 | 3.750000 |
+| 0 | 10 | 0-10 | Leaf | NA | NA | NA | NA | 0.15000001 | 1.000000 |
+| 0 | 11 | 0-11 | Leaf | NA | NA | NA | NA | -0.30000001 | 1.000000 |
+| 0 | 12 | 0-12 | len | 13.6 | 0-15 | 0-16 | 0-16 | 2.92307687 | 2.250000 |
+| 0 | 13 | 0-13 | Leaf | NA | NA | NA | NA | 0.06666667 | 1.250000 |
+| 0 | 14 | 0-14 | Leaf | NA | NA | NA | NA | -0.17142859 | 2.500000 |
+| 0 | 15 | 0-15 | Leaf | NA | NA | NA | NA | 0.20000002 | 1.250000 |
+| 0 | 16 | 0-16 | Leaf | NA | NA | NA | NA | -0.30000001 | 1.000000 |
+| 1 | 0 | 1-0 | len | 19.7 | 1-1 | 1-2 | 1-2 | 3.51329851 | 14.695991 |
+| 1 | 1 | 1-1 | dose | 1.0 | 1-3 | 1-4 | 1-4 | 1.63309026 | 7.308470 |
+| 1 | 2 | 1-2 | dose | 2.0 | 1-5 | 1-6 | 1-6 | 1.65485406 | 7.387520 |
+| 1 | 3 | 1-3 | len | 8.2 | 1-7 | 1-8 | 1-8 | 3.56799269 | 4.645680 |
+| 1 | 4 | 1-4 | Leaf | NA | NA | NA | NA | 0.28835031 | 2.662790 |
+| 1 | 5 | 1-5 | Leaf | NA | NA | NA | NA | -0.28835031 | 2.662790 |
+| 1 | 6 | 1-6 | len | 26.7 | 1-9 | 1-10 | 1-10 | 0.22153124 | 4.724730 |
+| 1 | 7 | 1-7 | Leaf | NA | NA | NA | NA | 0.30163023 | 1.452431 |
+| 1 | 8 | 1-8 | len | 11.2 | 1-11 | 1-12 | 1-12 | 0.25236940 | 3.193249 |
+| 1 | 9 | 1-9 | len | 24.5 | 1-13 | 1-14 | 1-14 | 0.44972166 | 2.985818 |
+| 1 | 10 | 1-10 | Leaf | NA | NA | NA | NA | 0.05241550 | 1.738913 |
+| 1 | 11 | 1-11 | Leaf | NA | NA | NA | NA | -0.21860033 | 1.472866 |
+| 1 | 12 | 1-12 | Leaf | NA | NA | NA | NA | -0.03878851 | 1.720383 |
+| 1 | 13 | 1-13 | Leaf | NA | NA | NA | NA | 0.05559399 | 1.248612 |
+| 1 | 14 | 1-14 | Leaf | NA | NA | NA | NA | -0.13160129 | 1.737206 |
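+
+Since `xgb.model.dt.tree()` returns a `data.table`, the dump can be
+sliced with standard `data.table` syntax; a small sketch (reusing the
+model from above) that keeps only the split nodes of the first tree:
+
+``` r
+# filter the tree dump with data.table syntax
+tree_dt <- xgb.model.dt.tree(model)
+tree_dt[Tree == 0 & Feature != "Leaf", .(Node, Feature, Split, Gain)]
+```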
+ +# Other features + +XGBoost supports many additional features on top of its traditional +gradient-boosting framework, including, among others: + +- Building decision tree models with characteristics such as + per-feature monotonicity constraints or interaction constraints. +- Calculating feature contributions in individual predictions. +- Using custom objectives and custom evaluation metrics. +- Fitting linear models. +- Fitting models on GPUs and/or on data that doesn’t fit in RAM + (“external memory”). + +See the [online +documentation](https://xgboost.readthedocs.io/en/stable/index.html) - +particularly the [tutorials +section](https://xgboost.readthedocs.io/en/stable/tutorials/index.html) - +for a glimpse over further functionalities that XGBoost offers. + +# The low-level interface + +In addition to the `xgboost(x, y, ...)` function, XGBoost also provides +a lower-level interface for creating model objects through the function +`xgb.train()`, which resembles the same `xgb.train` functions in other +language bindings of XGBoost. + +This `xgb.train()` interface exposes additional functionalities (such as +user-supplied callbacks or external-memory data support) and performs +fewer data validations and castings compared to the `xgboost()` function +interface. + +Some key differences between the two interfaces: + +- Unlike `xgboost()` which takes R objects such as `matrix` or + `data.frame` as inputs, the function `xgb.train()` uses XGBoost’s + own data container called “DMatrix”, which can be created from R + objects through the function `xgb.DMatrix()`. Note that there are + other “DMatrix” constructors too, such as “xgb.QuantileDMatrix()”, + which might be more beneficial for some use-cases. +- A “DMatrix” object may contain a mixture of features/covariates, the + response variable, observation weights, base margins, among others; + and unlike `xgboost()`, requires its inputs to have already been + encoded into the representation that XGBoost uses behind the + scenes - for example, while `xgboost()` may take a `factor` object + as “y”, `xgb.DMatrix()` requires instead a binary response variable + to be passed as a vector of zeros and ones. +- Hyperparameters are passed as function arguments in `xgboost()`, + while they are passed as a named list to `xgb.train()`. +- The `xgb.train()` interface keeps less metadata about its inputs - + for example, it will not add levels of factors as column names to + estimated probabilities when calling `predict`. + +Example usage of `xgb.train()`: + +``` r +data("agaricus.train") +dmatrix <- xgb.DMatrix( + data = agaricus.train$data, # a sparse CSC matrix ('dgCMatrix') + label = agaricus.train$label # zeros and ones +) +booster <- xgb.train( + data = dmatrix, + nrounds = 10, + params = list( + objective = "binary:logistic", + nthreads = 1, + max_depth = 3 + ) +) + +data("agaricus.test") +dmatrix_test <- xgb.DMatrix(agaricus.test$data) +pred_prob <- predict(booster, dmatrix_test) +pred_raw <- predict(booster, dmatrix_test, outputmargin = TRUE) +``` + +Model objects produced by `xgb.train()` have class `xgb.Booster`, while +model objects produced by `xgboost()` have class `xgboost`, which is a +subclass of `xgb.Booster`. Their `predict` methods also take different +arguments - for example, `predict.xgboost` has a `type` parameter, while +`predict.xgb.Booster` controls this through binary arguments - but as +`xgboost` is a subclass of `xgb.Booster`, methods for `xgb.Booster` can +be called on `xgboost` objects if needed. 
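+
+As a small illustration of the difference in `predict` signatures
+described above (reusing objects created earlier in this vignette),
+both classes return predictions, but select the kind of output through
+different argument styles:
+
+``` r
+# 'xgboost' interface: a single 'type' argument
+predict(model, x[1:3, ], type = "response")
+
+# 'xgb.Booster' interface: individual flags such as 'outputmargin'
+predict(booster, dmatrix_test)                       # probabilities
+predict(booster, dmatrix_test, outputmargin = TRUE)  # raw margin / log-odds
+```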
+
+Utility functions in the XGBoost R package will work with both model
+classes - for example:
+
+``` r
+xgb.importance(model)
+xgb.importance(booster)
+```
+
+A data.table: 2 × 4
+
+| Feature <chr> | Gain <dbl> | Cover <dbl> | Frequency <dbl> |
+|---|---|---|---|
+| len | 0.7444265 | 0.6830449 | 0.7333333 |
+| dose | 0.2555735 | 0.3169551 | 0.2666667 |
+
+A data.table: 15 × 4
+
+| Feature <chr> | Gain <dbl> | Cover <dbl> | Frequency <dbl> |
+|---|---|---|---|
+| odor=none | 0.6083687503 | 0.3459792871 | 0.16949153 |
+| stalk-root=club | 0.0959684807 | 0.0695742744 | 0.03389831 |
+| odor=anise | 0.0645662853 | 0.0777761744 | 0.10169492 |
+| odor=almond | 0.0542574659 | 0.0865120182 | 0.10169492 |
+| bruises?=bruises | 0.0532525762 | 0.0535293301 | 0.06779661 |
+| stalk-root=rooted | 0.0471992509 | 0.0610565707 | 0.03389831 |
+| spore-print-color=green | 0.0326096192 | 0.1418126308 | 0.16949153 |
+| odor=foul | 0.0153302980 | 0.0103517575 | 0.01694915 |
+| stalk-surface-below-ring=scaly | 0.0126892940 | 0.0914230316 | 0.08474576 |
+| gill-size=broad | 0.0066973198 | 0.0345993858 | 0.10169492 |
+| odor=pungent | 0.0027091458 | 0.0032193586 | 0.01694915 |
+| population=clustered | 0.0025750464 | 0.0015616374 | 0.03389831 |
+| stalk-color-below-ring=yellow | 0.0016913567 | 0.0173903519 | 0.01694915 |
+| spore-print-color=white | 0.0012798160 | 0.0008031107 | 0.01694915 |
+| gill-spacing=close | 0.0008052948 | 0.0044110809 | 0.03389831 |
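+
+Global importances like these can be complemented with per-observation
+feature contributions - one of the additional features mentioned
+earlier. A minimal sketch, assuming the long-standing `predcontrib`
+flag of `predict.xgb.Booster()`:
+
+``` r
+# per-row feature contributions; 'predcontrib' is assumed available as in
+# previous package versions, returning one column per feature plus a bias term
+contribs <- predict(booster, dmatrix_test, predcontrib = TRUE)
+contribs[1:2, 1:5]
+```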
+ +While `xgboost()` aims to provide a user-friendly interface, there are +still many situations where one should prefer the `xgb.train()` +interface - for example: + +- For latency-sensitive applications (e.g. when serving models in real + time), `xgb.train()` will have a speed advantage, as it performs + fewer validations, conversions, and post-processings with metadata. +- If you are developing an R package that depends on XGBoost, + `xgb.train()` will provide a more stable interface (less subject to + changes) and will have lower time/memory overhead. +- If you need functionalities that are not exposed by the `xgboost()` + interface - for example, if your dataset does not fit into the + computer’s RAM, it’s still possible to construct a DMatrix from it + if the data is loaded in batches through `xgb.ExtMemDMatrix()`. From 692e3a2b8e017c3f26778781138a19235ac4a241 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 7 Jan 2025 20:24:42 +0100 Subject: [PATCH 2/9] add script and instructions to update vignette --- R-package/.Rbuildignore | 1 + R-package/update_md_vignette.sh | 8 ++++++++ R-package/vignettes/xgboost_introduction.qmd | 1 - doc/R-package/index.rst | 1 + doc/R-package/updating_vignette.rst | 19 +++++++++++++++++++ doc/R-package/xgboost_introduction.md | 17 ++++++++++------- 6 files changed, 39 insertions(+), 8 deletions(-) create mode 100755 R-package/update_md_vignette.sh create mode 100644 doc/R-package/updating_vignette.rst diff --git a/R-package/.Rbuildignore b/R-package/.Rbuildignore index b1932e324589..c2b01f681b65 100644 --- a/R-package/.Rbuildignore +++ b/R-package/.Rbuildignore @@ -6,3 +6,4 @@ README.md ^doc$ ^Meta$ +\.py$ diff --git a/R-package/update_md_vignette.sh b/R-package/update_md_vignette.sh new file mode 100755 index 000000000000..9a404bc74071 --- /dev/null +++ b/R-package/update_md_vignette.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env sh +export tmpfile=$(mktemp --suffix=.qmd) +head -n 11 vignettes/xgboost_introduction.qmd > ${tmpfile} +printf "jupyter: ir\n" >> ${tmpfile} +tail -n +12 vignettes/xgboost_introduction.qmd >> ${tmpfile} +quarto render ${tmpfile} --to md -o xgboost_introduction.md +mv xgboost_introduction.md ../doc/R-package/xgboost_introduction.md +rm ${tmpfile} diff --git a/R-package/vignettes/xgboost_introduction.qmd b/R-package/vignettes/xgboost_introduction.qmd index a9622bab50b5..e80094f81ca2 100644 --- a/R-package/vignettes/xgboost_introduction.qmd +++ b/R-package/vignettes/xgboost_introduction.qmd @@ -9,7 +9,6 @@ format: embed-resources: true theme: yeti highlight-style: pygments -jupyter: ir --- # Introduction diff --git a/doc/R-package/index.rst b/doc/R-package/index.rst index 1707347a9338..0e803953e3f3 100644 --- a/doc/R-package/index.rst +++ b/doc/R-package/index.rst @@ -35,3 +35,4 @@ Other topics Handling of indexable elements Developer guide: parameters from core library + Developer guide: updating R vignette diff --git a/doc/R-package/updating_vignette.rst b/doc/R-package/updating_vignette.rst new file mode 100644 index 000000000000..12df779c6e53 --- /dev/null +++ b/doc/R-package/updating_vignette.rst @@ -0,0 +1,19 @@ +Developer guide: updating R vignette +==================================== + +The R package for XGBoost includes an introductory vignette ``xgboost_introduction``, which is built from a quarto source file (``.qmd``). + +This vignette is included in both the CRAN package and the online docs. 
The vignette in the CRAN package gets automatically built upon creation of the installable package artifact, but the one in the online docs needs to be rebuilt and re-commited if the quarto source file is modified. + +The rendered vignette in the online docs is compiled to ``.md`` instead of ``.html``, hence it needs to be rendered with non-default options, and importantly, **the source vignette should not render any graphical outputs** (just formatted text), as those do not get included when compiling to ``.md``. + +To update the ``.md`` file, execute the file ``update_md_vignette.sh`` under ``/R-package`` (from XGBoost's root) using a POSIX-compliant shell (like ``bash``): + +.. code-block:: shell + ./update_md_vignette.sh + +Since generation of ``.md`` files is from ``.qmd`` is not supported with the default knitr engine, it needs to use the jupyter engine instead, through the R kernel 'ir' - meaning: re-creating the vignette needs an installation of ``quarto``, ``nbconvert``, ``nbformat``, ``ipykernel``, ``ir``, ``R``, `xgboost's R package, and its R dependencies. + +As the IR jupyter kernel is not the default rendering engine for ``.qmd``, the script ``update_md_vignette.py`` modifies the source file by adding an extra line ``jupyter: ir`` in the yaml header, and uses the resulting modified ``.qmd`` file to compile to ``.md``. This line ``jupyter: ir`` should not be committed to the ``.qmd`` source file as it is not available on CRAN and therefore vignettes from the R package wouldn't build with that line included. + +Note that knitr and IR render tables from data frames differently, so the files generated by each engine will not match exactly. diff --git a/doc/R-package/xgboost_introduction.md b/doc/R-package/xgboost_introduction.md index 2c74284da355..efac4c5ed7c8 100644 --- a/doc/R-package/xgboost_introduction.md +++ b/doc/R-package/xgboost_introduction.md @@ -221,16 +221,16 @@ xgboost( ) ``` - [1] train-auc:0.755556 train-logloss:0.663745 - [2] train-auc:0.785556 train-logloss:0.647036 - [3] train-auc:0.792778 train-logloss:0.633082 - [4] train-auc:0.792778 train-logloss:0.612353 - [5] train-auc:0.837778 train-logloss:0.601307 + [1] train-auc:0.703671 train-logloss:0.669935 eval-auc:0.546875 eval-logloss:0.722103 + [2] train-auc:0.703671 train-logloss:0.654512 eval-auc:0.546875 eval-logloss:0.721223 + [3] train-auc:0.703671 train-logloss:0.642302 eval-auc:0.546875 eval-logloss:0.721304 + [4] train-auc:0.819930 train-logloss:0.618349 eval-auc:0.593750 eval-logloss:0.703055 + [5] train-auc:0.848776 train-logloss:0.606215 eval-auc:0.609375 eval-logloss:0.708907 XGBoost model object Call: xgboost(x = x, y = y, nrounds = 5, verbosity = 1, monitor_training = TRUE, - nthreads = 1, eval_set = 0.2, eval_metric = c("auc", "logloss"), + eval_set = 0.2, nthreads = 1, eval_metric = c("auc", "logloss"), max_depth = 2, reg_lambda = 0.5, learning_rate = 0.15) Objective: binary:logistic Number of iterations: 5 @@ -239,7 +239,7 @@ xgboost( XGBoost model object Call: xgboost(x = x, y = y, nrounds = 5, verbosity = 1, monitor_training = TRUE, - nthreads = 1, eval_set = 0.2, eval_metric = c("auc", "logloss"), + eval_set = 0.2, nthreads = 1, eval_metric = c("auc", "logloss"), max_depth = 2, reg_lambda = 0.5, learning_rate = 0.15) Objective: binary:logistic Number of iterations: 5 @@ -271,6 +271,9 @@ attributes(model) $params$seed [1] 0 + $params$verbosity + [1] 0 + $params$validate_parameters [1] TRUE From 107f18d5f1697a6c3d2caf33c82cdeac9b26de15 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Tue, 7 
Jan 2025 20:29:35 +0100 Subject: [PATCH 3/9] remove accidental commit --- R-package/.Rbuildignore | 1 - 1 file changed, 1 deletion(-) diff --git a/R-package/.Rbuildignore b/R-package/.Rbuildignore index c2b01f681b65..b1932e324589 100644 --- a/R-package/.Rbuildignore +++ b/R-package/.Rbuildignore @@ -6,4 +6,3 @@ README.md ^doc$ ^Meta$ -\.py$ From af0b654177717d94d972e9df8adce910e105a36b Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 8 Jan 2025 20:40:43 +0100 Subject: [PATCH 4/9] install quarto for vignette --- R-package/tests/helper_scripts/install_deps.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/tests/helper_scripts/install_deps.R b/R-package/tests/helper_scripts/install_deps.R index 757475541fd4..bde722b5901c 100644 --- a/R-package/tests/helper_scripts/install_deps.R +++ b/R-package/tests/helper_scripts/install_deps.R @@ -11,6 +11,7 @@ pkgs <- c( ## suggests "knitr", "rmarkdown", + "quarto", "ggplot2", "DiagrammeR", "DiagrammeRsvg", From f5d5330f0558a8bf09a494d9ef5c569b0516660e Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 8 Jan 2025 21:13:12 +0100 Subject: [PATCH 5/9] install system quarto --- .github/workflows/r_tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index 43ad372a1e84..7f4fc31d7dfc 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -77,6 +77,10 @@ jobs: # Must run before checkout to have the latest git installed. # No need to add pandoc, the container has it figured out. apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git librsvg2-dev librsvg2-2 -y + - name: Install system quarto + run: | + wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.40/quarto-1.6.40-linux-amd64.deb + dpkg -i quarto-1.6.40-linux-amd64.deb - name: Trust git cloning project sources run: | git config --global --add safe.directory "${GITHUB_WORKSPACE}" From 7698df086078cad46c764b595dc0d8e10920820a Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 8 Jan 2025 21:21:16 +0100 Subject: [PATCH 6/9] install wget --- .github/workflows/r_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index 7f4fc31d7dfc..dc022c92f099 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -79,6 +79,7 @@ jobs: apt update && apt install libcurl4-openssl-dev libssl-dev libssh2-1-dev libgit2-dev libglpk-dev libxml2-dev libharfbuzz-dev libfribidi-dev git librsvg2-dev librsvg2-2 -y - name: Install system quarto run: | + apt-get install -y wget wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.6.40/quarto-1.6.40-linux-amd64.deb dpkg -i quarto-1.6.40-linux-amd64.deb - name: Trust git cloning project sources From 40008b7e4728e110e1964f74c8f14e14f7b12091 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 8 Jan 2025 21:36:46 +0100 Subject: [PATCH 7/9] remove files that got re-added --- R-package/vignettes/discoverYourData.Rmd | 325 ------------- R-package/vignettes/xgboostPresentation.Rmd | 512 -------------------- 2 files changed, 837 deletions(-) delete mode 100644 R-package/vignettes/discoverYourData.Rmd delete mode 100644 R-package/vignettes/xgboostPresentation.Rmd diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd deleted file mode 100644 index 8574dfe15b3e..000000000000 --- a/R-package/vignettes/discoverYourData.Rmd +++ 
/dev/null @@ -1,325 +0,0 @@ ---- -title: "Understand your dataset with XGBoost" -output: - rmarkdown::html_vignette: - css: vignette.css - number_sections: yes - toc: yes -author: Tianqi Chen, Tong He, Michaël Benesty, Yuan Tang -vignette: > - %\VignetteIndexEntry{Discover your data} - %\VignetteEngine{knitr::rmarkdown} - \usepackage[utf8]{inputenc} ---- - -Understand your dataset with XGBoost -==================================== - -Introduction ------------- - -The purpose of this vignette is to show you how to use **XGBoost** to discover and understand your own dataset better. - -This vignette is not about predicting anything (see [XGBoost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). We will explain how to use **XGBoost** to highlight the *link* between the *features* of your data and the *outcome*. - -Package loading: - -```{r libLoading, results='hold', message=F, warning=F} -require(xgboost) -require(Matrix) -require(data.table) -if (!require('vcd')) { - install.packages('vcd') -} - -data.table::setDTthreads(2) -``` - -> **VCD** package is used for one of its embedded dataset only. - -Preparation of the dataset --------------------------- - -### Numeric v.s. categorical variables - - -**XGBoost** manages only `numeric` vectors. - -What to do when you have *categorical* data? - -A *categorical* variable has a fixed number of different values. For instance, if a variable called *Colour* can have only one of these three values, *red*, *blue* or *green*, then *Colour* is a *categorical* variable. - -> In **R**, a *categorical* variable is called `factor`. -> -> Type `?factor` in the console for more information. - -To answer the question above we will convert *categorical* variables to `numeric` ones. - -### Conversion from categorical to numeric variables - -#### Looking at the raw data - -+In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = the majority of the matrix is non-zero) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero entries in the matrix) of `numeric` features. - -The method we are going to see is usually called [one-hot encoding](https://en.wikipedia.org/wiki/One-hot). - -The first step is to load the `Arthritis` dataset in memory and wrap it with the `data.table` package. - -```{r, results='hide'} -data(Arthritis) -df <- data.table(Arthritis, keep.rownames = FALSE) -``` - -> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **XGBoost's** **R** package use `data.table`. - -The first thing we want to do is to have a look to the first few lines of the `data.table`: - -```{r} -head(df) -``` - -Now we will check the format of each column. - -```{r} -str(df) -``` - -2 columns have `factor` type, one has `ordinal` type. - -> `ordinal` variable : -> -> * can take a limited number of values (like `factor`) ; -> * these values are ordered (unlike `factor`). Here these ordered values are: `Marked > Some > None` - -#### Creation of new features based on old ones - -We will add some new *categorical* features to see if it helps. 
- -##### Grouping per 10 years - -For the first features we create groups of age by rounding the real age. - -Note that we transform it to `factor` so the algorithm treats these age groups as independent values. - -Therefore, 20 is not closer to 30 than 60. In other words, the distance between ages is lost in this transformation. - -```{r} -head(df[, AgeDiscret := as.factor(round(Age / 10, 0))]) -``` - -##### Randomly split into two groups - -The following is an even stronger simplification of the real age with an arbitrary split at 30 years old. I choose this value **based on nothing**. We will see later if simplifying the information based on arbitrary values is a good strategy (you may already have an idea of how well it will work...). - -```{r} -head(df[, AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))]) -``` - -##### Risks in adding correlated features - -These new features are highly correlated to the `Age` feature because they are simple transformations of this feature. - -For many machine learning algorithms, using correlated features is not a good idea. It may sometimes make prediction less accurate, and most of the time make interpretation of the model almost impossible. GLM, for instance, assumes that the features are uncorrelated. - -Fortunately, decision tree algorithms (including boosted trees) are very robust to these features. Therefore we don't have to do anything to manage this situation. - -##### Cleaning data - -We remove ID as there is nothing to learn from this feature (it would just add some noise). - -```{r, results='hide'} -df[, ID := NULL] -``` - -We will list the different values for the column `Treatment`: - -```{r} -levels(df[, Treatment]) -``` - - -#### Encoding categorical features - -Next step, we will transform the categorical data to dummy variables. -Several encoding methods exist, e.g., [one-hot encoding](https://en.wikipedia.org/wiki/One-hot) is a common approach. -We will use the [dummy contrast coding](https://stats.oarc.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)). - -The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`. - -For example, the column `Treatment` will be replaced by two columns, `TreatmentPlacebo`, and `TreatmentTreated`. Each of them will be *binary*. Therefore, an observation which has the value `Placebo` in column `Treatment` before the transformation will have the value `1` in the new column `TreatmentPlacebo` and the value `0` in the new column `TreatmentTreated` after the transformation. The column `TreatmentPlacebo` will disappear during the contrast encoding, as it would be absorbed into a common constant intercept column. - -Column `Improved` is excluded because it will be our `label` column, the one we want to predict. - -```{r, warning=FALSE,message=FALSE} -sparse_matrix <- sparse.model.matrix(Improved ~ ., data = df)[, -1] -head(sparse_matrix) -``` - -> Formula `Improved ~ .` used above means transform all *categorical* features but column `Improved` to binary values. The `-1` column selection removes the intercept column which is full of `1` (this column is generated by the conversion). For more information, you can type `?sparse.model.matrix` in the console. 
- -Create the output `numeric` vector (not as a sparse `Matrix`): - -```{r} -output_vector <- df[, Improved] == "Marked" -``` - -1. set `Y` vector to `0`; -2. set `Y` to `1` for rows where `Improved == Marked` is `TRUE` ; -3. return `Y` vector. - -Build the model ---------------- - -The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [XGBoost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). - -```{r} -bst <- xgboost(x = sparse_matrix, y = output_vector, - max_depth = 4, learning_rate = 1, - nthread = 2, nrounds = 10) - -``` - -You can see some `train-logloss: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains the data. Lower is better. - -A small value for training error may be a symptom of [overfitting](https://en.wikipedia.org/wiki/Overfitting), meaning the model will not accurately predict unseen values. - -Feature importance ------------------- - -## Measure feature importance - - -### Build the feature importance data.table - -Remember, each binary column corresponds to a single value of one of *categorical* features. - -```{r} -importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst) -head(importance) -``` - -> The column `Gain` provides the information we are looking for. -> -> As you can see, features are classified by `Gain`. - -`Gain` is the improvement in accuracy brought by a feature to the branches it is on. The idea is that before adding a new split on a feature X to the branch there were some wrongly classified elements; after adding the split on this feature, there are two new branches, and each of these branches is more accurate (one branch saying if your observation is on this branch then it should be classified as `1`, and the other branch saying the exact opposite). - -`Cover` is related to the second order derivative (or Hessian) of the loss function with respect to a particular variable; thus, a large value indicates a variable has a large potential impact on the loss function and so is important. - -`Frequency` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it). - -### Plotting the feature importance - -All these things are nice, but it would be even better to plot the results. - -```{r, fig.width=8, fig.height=5, fig.align='center'} -xgb.plot.importance(importance_matrix = importance) -``` - -Running this line of code, you should get a bar chart showing the importance of the 6 features (containing the same data as the output we saw earlier, but displaying it visually for easier consumption). Note that `xgb.ggplot.importance` is also available for all the ggplot2 fans! - -> Depending of the dataset and the learning parameters you may have more than two clusters. Default value is to limit them to `10`, but you can increase this limit. Look at the function documentation for more information. - -According to the plot above, the most important features in this dataset to predict if the treatment will work are : - -* An individual's age; -* Having received a placebo or not; -* Gender; -* Our generated feature AgeDiscret. We can see that its contribution is very low. - - -### Do these results make sense? - - -Let's check some **Chi2** between each of these features and the label. - -Higher **Chi2** means better correlation. 
- -```{r, warning=FALSE, message=FALSE} -c2 <- chisq.test(df$Age, output_vector) -print(c2) -``` - -The Pearson correlation between Age and illness disappearing is **`r round(c2$statistic, 2 )`**. - -```{r, warning=FALSE, message=FALSE} -c2 <- chisq.test(df$AgeDiscret, output_vector) -print(c2) -``` - -Our first simplification of Age gives a Pearson correlation of **`r round(c2$statistic, 2)`**. - -```{r, warning=FALSE, message=FALSE} -c2 <- chisq.test(df$AgeCat, output_vector) -print(c2) -``` - -The perfectly random split we did between young and old at 30 years old has a low correlation of **2.36**. This suggests that, for the particular illness we are studying, the age at which someone is vulnerable to this disease is likely very different from 30. - -Moral of the story: don't let your *gut* lower the quality of your model. - -In *data science*, there is the word *science* :-) - -Conclusion ----------- - -As you can see, in general *destroying information by simplifying it won't improve your model*. **Chi2** just demonstrates that. - -But in more complex cases, creating a new feature from an existing one may help the algorithm and improve the model. - -+The case studied here is not complex enough to show that. Check [Kaggle website](https://www.kaggle.com/) for some challenging datasets. - -Moreover, you can see that even if we have added some new features which are not very useful/highly correlated with other features, the boosting tree algorithm was still able to choose the best one (which in this case is the Age). - -Linear models may not perform as well. - -Special Note: What about Random Forests™? ------------------------------------------ - -As you may know, the [Random Forests](https://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) family. - -Both train several decision trees for one dataset. The *main* difference is that in Random Forests, trees are independent and in boosting, the `N+1`-st tree focuses its learning on the loss (<=> what has not been well modeled by the tree `N`). - -This difference can have an impact on a edge case in feature importance analysis: *correlated features*. - -Imagine two features perfectly correlated, feature `A` and feature `B`. For one specific tree, if the algorithm needs one of them, it will choose randomly (true in both boosting and Random Forests). - -However, in Random Forests this random choice will be done for each tree, because each tree is independent from the others. Therefore, approximately (and depending on your parameters) 50% of the trees will choose feature `A` and the other 50% will choose feature `B`. So the *importance* of the information contained in `A` and `B` (which is the same, because they are perfectly correlated) is diluted in `A` and `B`. So you won't easily know this information is important to predict what you want to predict! It is even worse when you have 10 correlated features... - -In boosting, when a specific link between feature and outcome have been learned by the algorithm, it will try to not refocus on it (in theory it is what happens, reality is not always that simple). Therefore, all the importance will be on feature `A` or on feature `B` (but not both). You will know that one feature has an important role in the link between the observations and the label. It is still up to you to search for the correlated features to the one detected as important if you need to know all of them. 
- -If you want to try Random Forests algorithm, you can tweak XGBoost parameters! - -For instance, to compute a model with 1000 trees, with a 0.5 factor on sampling rows and columns: - -```{r, warning=FALSE, message=FALSE} -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -train <- agaricus.train -test <- agaricus.test - -#Random Forest - 1000 trees -bst <- xgboost( - x = train$data, - y = factor(train$label, levels = c(0, 1)), - max_depth = 4, - num_parallel_tree = 1000, - subsample = 0.5, - colsample_bytree = 0.5, - nrounds = 1, - nthread = 2 -) - -#Boosting - 3 rounds -bst <- xgboost( - x = train$data, - y = factor(train$label, levels = c(0, 1)), - max_depth = 4, - nrounds = 3, - nthread = 2 -) -``` - -> Note that the parameter `nrounds` is set to `1`. - -> [**Random Forests**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software. diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd deleted file mode 100644 index a3c61e3ab9ad..000000000000 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ /dev/null @@ -1,512 +0,0 @@ ---- -title: "XGBoost presentation" -output: - rmarkdown::html_vignette: - css: vignette.css - number_sections: yes - toc: yes -bibliography: xgboost.bib -author: Tianqi Chen, Tong He, Michaël Benesty, David Cortes -vignette: > - %\VignetteIndexEntry{XGBoost presentation} - %\VignetteEngine{knitr::rmarkdown} - \usepackage[utf8]{inputenc} ---- - -XGBoost R Tutorial -================== - -## Introduction - - -**XGBoost** is short for e**X**treme **G**radient **Boost**ing package. - -The purpose of this Vignette is to show you how to use **XGBoost** to build a model and make predictions. - -It is an efficient and scalable implementation of gradient boosting framework by @friedman2000additive and @friedman2001greedy. Two solvers are included: - -- *tree learning* algorithm (in different varieties). -- *linear* model ; - -It supports various objective functions, including *regression*, *classification* (binary and multi-class) and *ranking*. The package is made to be extensible, so that users are also allowed to define their own objective functions easily. - -It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions. - -It has several features: - -* Speed: it can automatically do parallel computations with *OpenMP*. It is generally over 10 times faster than the classical `gbm`. -* Input Type: it takes several types of input data: - * *Dense* Matrix: *R*'s *dense* matrix, i.e. `matrix` ; - * *Sparse* Matrix: *R*'s *sparse* matrix, i.e. `Matrix::dgCMatrix` ; - * Data File: local data files ; - * Data frames (class `data.frame` and sub-classes from it such as `data.table`), taking - both numeric and categorical (factor) features. - * `xgb.DMatrix`: its own class (recommended, also supporting numeric and categorical features). -* Customization: it supports customized objective functions and evaluation functions. - -## Installation - -Package can be easily installed from CRAN: - -```{r, eval=FALSE} -install.packages("xgboost") -``` - -For the development version, see the [GitHub page](https://github.com/dmlc/xgboost) and the [installation docs](https://xgboost.readthedocs.io/en/stable/install.html) for further instructions. 
- -## Learning - - -For the purpose of this tutorial we will load **XGBoost** package. - -```{r libLoading, results='hold', message=F, warning=F} -require(xgboost) -``` - -### Dataset presentation - - -In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the same as you will use on in your every day life :-). - -Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013. - -### Dataset loading - - -We will load the `agaricus` datasets embedded with the package and will link them to variables. - -The datasets are already split in: - -* `train`: will be used to build the model ; -* `test`: will be used to assess the quality of our model. - -Why *split* the dataset in two parts? - -In the first part we will build our model. In the second part we will want to test it and assess its quality. Without dividing the dataset we would test the model on the data which the algorithm have already seen. - -```{r datasetLoading, results='hold', message=F, warning=F} -data(agaricus.train, package = 'xgboost') -data(agaricus.test, package = 'xgboost') -train <- agaricus.train -test <- agaricus.test -``` - -> In the real world, it would be up to you to make this division between `train` and `test` data. - -Each variable is a `list` containing two things, `label` and `data`: - -```{r dataList, message=F, warning=F} -str(train) -``` - -`label` is the outcome of our dataset meaning it is the binary *classification* we will try to predict. - -Let's discover the dimensionality of our datasets. - -```{r dataSize, message=F, warning=F} -dim(train$data) -dim(test$data) -``` - -This dataset is very small to not make the **R** package too heavy, however **XGBoost** is built to manage huge datasets very efficiently. - -As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`): - -```{r dataClass, message=F, warning=F} -class(train$data)[1] -class(train$label) -``` - -### Basic Training using XGBoost - - -This step is the most critical part of the process for the quality of our model. - -#### Basic training - -We are using the `train` data. As explained above, both `data` and `label` are stored in a `list`. - -In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore, in a dataset mainly made of `0`, memory size is reduced. It is very usual to have such dataset. - -We will train a decision tree model using the following parameters: - -* `objective = "binary:logistic"`: we will train a binary classification model (note that this is set automatically when `y` is a `factor`) ; -* `max_depth = 2`: the trees won't be deep, because our case is very simple ; -* `nthread = 2`: the number of CPU threads we are going to use; -* `nrounds = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction. - -```{r trainingSparse, message=F, warning=F} -bstSparse <- xgboost( - x = train$data - , y = factor(train$label, levels = c(0, 1)) - , objective = "binary:logistic" - , max_depth = 2 - , learning_rate = 1 - , nrounds = 2 - , nthread = 2 -) -``` - -Note that, while the R function `xgboost()` follows typical R idioms for statistical modeling packages -such as an x/y division and having those as first arguments, it also offers a more flexible `xgb.train` -interface which is more consistent across different language bindings (e.g. 
arguments are the same as -in the Python XGBoost library) and which exposes some additional functionalities. The `xgb.train` -interface uses XGBoost's own DMatrix class to pass data to it, and accepts the model parameters instead -as a named list: - -```{r} -bstTrInterface <- xgb.train( - data = xgb.DMatrix(train$data, label = train$label, nthread = 1) - , params = xgb.params( - objective = "binary:logistic" - , max_depth = 2 - , learning_rate = 1 - , nthread = 2 - ) - , nrounds = 2 -) -``` - -For the rest of this tutorial, we'll nevertheless be using the `xgboost()` interface which will be -more familiar to users of packages such as GLMNET or Ranger. - -> More complex the relationship between your features and your `label` is, more passes you need. - -#### Parameter variations - -##### Dense matrix - -Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** matrix. - -```{r trainingDense, message=F, warning=F} -bstDense <- xgboost( - x = as.matrix(train$data), - y = factor(train$label, levels = c(0, 1)), - max_depth = 2, - learning_rate = 1, - nthread = 2, - nrounds = 2 -) -``` - -##### Data frame - -As another alternative, XGBoost will also accept `data.frame` objects, from which it can -use numeric, integer and factor columns: - -```{r} -df_train <- as.data.frame(as.matrix(train$data)) -bstDF <- xgboost( - x = df_train, - y = factor(train$label, levels = c(0, 1)), - max_depth = 2, - learning_rate = 1, - nthread = 2, - nrounds = 2 -) -``` - -##### Verbosity levels - -**XGBoost** has several features to help you to view how the learning progresses internally. The purpose is to help you -set the best parameters, which is the key of your model quality. Note that when using the `xgb.train` interface, -one can also use a separate evaluation dataset (e.g. a different subset of the data than the training dataset) on -which to monitor metrics of interest, and it also offers an `xgb.cv` function which automatically splits the data -to create evaluation subsets for you. - -One of the simplest way to see the training progress is to set the `verbosity` option: - -```{r trainingVerbose1, message=T, warning=F} -# verbosity = 1, print evaluation metric -bst <- xgboost( - x = train$data, - y = factor(train$label, levels = c(0, 1)), - max_depth = 2, - learning_rate = 1, - nthread = 2, - objective = "binary:logistic", - nrounds = 5, - verbosity = 1 -) -``` - -## Basic prediction using XGBoost - - -## Perform the prediction - - -The purpose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step. - -```{r predicting, message=F, warning=F} -pred <- predict(bst, test$data) - -# size of the prediction vector -print(length(pred)) - -# limit display of predictions to the first 10 -print(head(pred)) -``` - -These numbers reflect the predicted probabilities of belonging to the class '1' in the 'y' data. Tautologically, -the probability of belonging to the class '0' is then $P(y=0) = 1 - P(y=1)$. This implies: if the number is greater -than 0.5, then according to the model it is more likely than an observation will be of class '1', whereas if the -number if lower than 0.5, it is more likely that the observation will be of class '0': - -```{r predictingTest, message=F, warning=F} -prediction <- as.numeric(pred > 0.5) -print(head(prediction)) -``` - -Note that one can also control the prediction type directly to obtain classes instead of probabilities. 
- -## Measuring model performance - - -To measure the model performance, we will compute a simple metric, the *accuracy rate*. - -```{r predictingAverageError, message=F, warning=F} -acc <- mean(as.numeric(pred > 0.5) == test$label) -print(paste("test-acc=", acc)) -``` - -> Note that the algorithm has not seen the `test` data during the model construction. - -Steps explanation: - -1. `as.numeric(pred > 0.5)` applies our rule that when the probability (<=> regression <=> prediction) is `> 0.5` the observation is classified as `1` and `0` otherwise ; -2. `probabilityVectorPreviouslyComputed == test$label` whether the predicted class matches with the real data ; -3. `mean(vectorOfMatches)` computes the *accuracy rate* itself. - -The most important thing to remember is that **to obtain the predicted class of an observation, a threshold needs to be applied on the predicted probabilities**. - -*Multiclass* classification works in a similar way. - -This metric is **`r round(acc, 2)`** and is pretty high: our yummy mushroom model works well! - -## Advanced features - - -Most of the features below have been implemented to help you to improve your model by offering a better understanding of its content. - - -### Dataset preparation for xgb.train - - -For the following advanced features, we'll be using the `xgb.train()` interface instead of the `xbgoost()` -interface, so we need to put data in an `xgb.DMatrix` as explained earlier: - -```{r DMatrix, message=F, warning=F} -dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2) -dtest <- xgb.DMatrix(data = test$data, label = test$label, nthread = 2) -``` - -### Measure learning progress with xgb.train - - -Both `xgboost` (simple) and `xgb.train` (advanced) functions train models. - -One of the special feature of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds lead to an overfitting. You can see this feature as a cousin of cross-validation method. The following techniques will help you to avoid overfitting or optimizing the learning time in stopping it as soon as possible. - -One way to measure progress in learning of a model is to provide to **XGBoost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning. - -> in some way it is similar to what we have done above with the prediction accuracy. The main difference is that below it was after building the model, and now it is during the construction that we measure quality of predictions. - -For the purpose of this example, we use the `evals` parameter. It is a list of `xgb.DMatrix` objects, each of them tagged with a name. - -```{r evals, message=F, warning=F} -evals <- list(train = dtrain, test = dtest) - -bst <- xgb.train( - data = dtrain - , params = list( - max_depth = 2 - , learning_rate = 1 - , nthread = 2 - , objective = "binary:logistic" - ) - , nrounds = 2 - , evals = evals -) -``` - -**XGBoost** has computed at each round the same (negative of) average log-loss (logarithm of the Bernoulli likelihood) -that it uses as optimization objective to minimize in both of the datasets. Obviously, the `train_logloss` number is -related to the training dataset (the one the algorithm learns from) and the `test_logloss` number to the test dataset. 
- -Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset. - -If with your own dataset you have not such results, you should think about how you divided your dataset in training and test. May be there is something to fix. - -For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics. - -```{r evals2, message=F, warning=F} -bst <- xgb.train( - data = dtrain - , params = list( - learning_rate = 1 - , max_depth = 2 - , nthread = 2 - , objective = "binary:logistic" - , eval_metric = "error" - , eval_metric = "logloss" - ) - , nrounds = 2 - , evals = evals -) -``` - -> `eval_metric` allows us to monitor two new metrics for each round, `logloss` and `error`. - -### Linear boosting - - -Until now, all the learnings we have performed were based on boosting trees. **XGBoost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `learning_rate` parameter). - -```{r linearBoosting, message=F, warning=F} -bst <- xgb.train( - data = dtrain - , params = list( - booster = "gblinear" - , nthread = 2 - , objective = "binary:logistic" - , eval_metric = "error" - , eval_metric = "logloss" - ) - , nrounds = 2 - , evals = evals -) -``` - -In this specific case, *linear boosting* gets slightly better performance metrics than decision trees based algorithm. - -In simple cases, it will happen because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better to catch a non linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use. - -### Manipulating xgb.DMatrix - - -#### Save / Load - -Like saving models, `xgb.DMatrix` object (which groups both dataset and outcome) can also be saved using `xgb.DMatrix.save` function. - -```{r DMatrixSave, message=F, warning=F} -fname <- file.path(tempdir(), "dtrain.buffer") -xgb.DMatrix.save(dtrain, fname) -# to load it in, simply call xgb.DMatrix -dtrain2 <- xgb.DMatrix(fname) -bst <- xgb.train( - data = dtrain2 - , params = list( - max_depth = 2 - , learning_rate = 1 - , nthread = 2 - , objective = "binary:logistic" - ) - , nrounds = 2 - , evals = evals -) -``` - -```{r DMatrixDel, include=FALSE} -file.remove(fname) -``` - -#### Information extraction - -Information can be extracted from `xgb.DMatrix` using `getinfo` function. Hereafter we will extract `label` data. - -```{r getinfo, message=F, warning=F} -label <- getinfo(dtest, "label") -pred <- predict(bst, dtest) -err <- as.numeric(sum(as.integer(pred > 0.5) != label)) / length(label) -print(paste("test-error=", err)) -``` - -### View feature importance/influence from the fitted model - - -Feature importance is similar to R gbm package's relative influence (rel.inf). 
- -```{r} -importance_matrix <- xgb.importance(model = bst) -print(importance_matrix) -xgb.plot.importance(importance_matrix = importance_matrix) -``` - -#### View the trees from a model - - -XGBoost can output the trees it fitted in a standard tabular format: - -```{r} -xgb.model.dt.tree(bst) -``` - -You can plot the trees from your model using ```xgb.plot.tree`` - -```{r} -xgb.plot.tree(model = bst) -``` - -> if you provide a path to `fname` parameter you can save the trees to your hard drive. - -#### Save and load models - - -Maybe your dataset is big, and it takes time to train a model on it? May be you are not a big fan of losing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required. - -XGBoost models can be saved through R functions such as `save` and `saveRDS`, but in addition, it also offers -its own serialization format, which might have better compatibility guarantees across versions of XGBoost and -which can also be loaded into other language bindings: - -```{r saveModel, message=F, warning=F} -# save model to binary local file -fname <- file.path(tempdir(), "xgb_model.ubj") -xgb.save(bst, fname) -``` - -> `xgb.save` function should return `r TRUE` if everything goes well and crashes otherwise. - -An interesting test to see how identical our saved model is to the original one would be to compare the two predictions. - -```{r loadModel, message=F, warning=F} -# load binary model to R -# Note that the number of threads for 'xgb.load' is taken from global config, -# can be modified like this: -RhpcBLASctl::omp_set_num_threads(1) -bst2 <- xgb.load(fname) -xgb.model.parameters(bst2) <- list(nthread = 2) -pred2 <- predict(bst2, test$data) - -# And now the test -print(paste("sum(abs(pred2-pred))=", sum(abs(pred2 - pred)))) -``` - -```{r clean, include=FALSE} -# delete the created model -file.remove(fname) -``` - -> result is `0`? We are good! - -In some very specific cases, you will want to save the model as a *R* raw vector. See below how to do it. - -```{r saveLoadRBinVectorModel, message=F, warning=F} -# save model to R's raw vector -rawVec <- xgb.save.raw(bst) - -# print class -print(class(rawVec)) - -# load binary model to R -bst3 <- xgb.load.raw(rawVec) -xgb.model.parameters(bst3) <- list(nthread = 2) -pred3 <- predict(bst3, test$data) - -# pred2 should be identical to pred -print(paste("sum(abs(pred3-pred))=", sum(abs(pred2 - pred)))) -``` - -> Again `0`? It seems that `XGBoost` works pretty well! 
- -## References From 682bf2009b5f266b1fd3f65599217d22b68ab0d5 Mon Sep 17 00:00:00 2001 From: david-cortes Date: Wed, 8 Jan 2025 21:39:33 +0100 Subject: [PATCH 8/9] newer syntax --- R-package/vignettes/xgboost_introduction.qmd | 4 ++-- doc/R-package/xgboost_introduction.md | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/R-package/vignettes/xgboost_introduction.qmd b/R-package/vignettes/xgboost_introduction.qmd index e80094f81ca2..7892bc72dc3f 100644 --- a/R-package/vignettes/xgboost_introduction.qmd +++ b/R-package/vignettes/xgboost_introduction.qmd @@ -166,9 +166,9 @@ dmatrix <- xgb.DMatrix( booster <- xgb.train( data = dmatrix, nrounds = 10, - params = list( + params = xgb.params( objective = "binary:logistic", - nthreads = 1, + nthread = 1, max_depth = 3 ) ) diff --git a/doc/R-package/xgboost_introduction.md b/doc/R-package/xgboost_introduction.md index efac4c5ed7c8..35cfec6972cc 100644 --- a/doc/R-package/xgboost_introduction.md +++ b/doc/R-package/xgboost_introduction.md @@ -221,11 +221,11 @@ xgboost( ) ``` - [1] train-auc:0.703671 train-logloss:0.669935 eval-auc:0.546875 eval-logloss:0.722103 - [2] train-auc:0.703671 train-logloss:0.654512 eval-auc:0.546875 eval-logloss:0.721223 - [3] train-auc:0.703671 train-logloss:0.642302 eval-auc:0.546875 eval-logloss:0.721304 - [4] train-auc:0.819930 train-logloss:0.618349 eval-auc:0.593750 eval-logloss:0.703055 - [5] train-auc:0.848776 train-logloss:0.606215 eval-auc:0.609375 eval-logloss:0.708907 + [1] train-auc:0.763021 train-logloss:0.665634 eval-auc:0.444444 eval-logloss:0.697723 + [2] train-auc:0.802083 train-logloss:0.643556 eval-auc:0.527778 eval-logloss:0.695267 + [3] train-auc:0.793403 train-logloss:0.625402 eval-auc:0.472222 eval-logloss:0.701788 + [4] train-auc:0.815972 train-logloss:0.611023 eval-auc:0.527778 eval-logloss:0.703274 + [5] train-auc:0.815972 train-logloss:0.599548 eval-auc:0.527778 eval-logloss:0.706069 XGBoost model object Call: @@ -826,9 +826,9 @@ dmatrix <- xgb.DMatrix( booster <- xgb.train( data = dmatrix, nrounds = 10, - params = list( + params = xgb.params( objective = "binary:logistic", - nthreads = 1, + nthread = 1, max_depth = 3 ) ) From 6b921f008dab596165f2f53387778c8f6b678e0c Mon Sep 17 00:00:00 2001 From: david-cortes Date: Thu, 9 Jan 2025 18:47:37 +0100 Subject: [PATCH 9/9] update docs --- R-package/man/xgboost.Rd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index cf4f9817b6a1..180fd59d3643 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -635,7 +635,8 @@ R's conventions for model fitting and predictions, but which doesn't expose all possible functionalities of the core XGBoost library. See \code{\link[=xgb.train]{xgb.train()}} for a more flexible low-level alternative which is similar across different -language bindings of XGBoost and which exposes the full library's functionalities. +language bindings of XGBoost and which exposes additional functionalities such as training on +external memory data and learning-to-rank objectives. By default, most of the parameters here have a value of \code{NULL}, which signals XGBoost to use its default value. Default values are automatically determined by the XGBoost core library, and are