Merge remote-tracking branch 'upstream/master'

dmlc · Dec 6, 2024 · af6e023 · af6e023
2 parents 070d23f + 54930ec
commit af6e023
Show file tree

Hide file tree

Showing 61 changed files with 2,048 additions and 316 deletions.
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
@@ -1,4 +1,4 @@
-Thanks for participating in the XGBoost community! We use https://discuss.xgboost.ai for any general usage questions and discussions. The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking.  You are always welcomed to post on the forum first :)
+Thanks for participating in the XGBoost community! The issue tracker is used for actionable items such as feature proposals discussion, roadmaps, and bug tracking.
 
 Issues that are inactive for a period of time may get closed. We adopt this policy so that we won't lose track of actionable issues that may fall at the bottom of the pile. Feel free to reopen a new one if you feel there is an additional problem that needs attention when an old one gets closed.
 

diff --git a/.gitignore b/.gitignore
@@ -52,6 +52,7 @@ Debug
 *.bak
 #.Rbuildignore
 R-package.Rproj
+R-package/build/*
 *.cache*
 .mypy_cache/
 doxygen
@@ -144,11 +145,13 @@ credentials.csv
 .bloop
 
 # python tests
+*.bin
 demo/**/*.txt
 *.dmatrix
 .hypothesis
 __MACOSX/
 model*.json
+/tests/python/models/models/
 
 # R tests
 *.htm

diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE
@@ -10,6 +10,7 @@ S3method(getinfo,xgb.Booster)
 S3method(getinfo,xgb.DMatrix)
 S3method(length,xgb.Booster)
 S3method(predict,xgb.Booster)
+S3method(predict,xgboost)
 S3method(print,xgb.Booster)
 S3method(print,xgb.DMatrix)
 S3method(print,xgb.cv.synchronous)

diff --git a/R-package/R/utils.R b/R-package/R/utils.R
@@ -423,7 +423,7 @@ NULL
 #'
 #' @description
 #' When it comes to serializing XGBoost models, it's possible to use R serializers such as
-#' [save()] or [saveRDS()] to serialize an XGBoost R model, but XGBoost also provides
+#' [save()] or [saveRDS()] to serialize an XGBoost model object, but XGBoost also provides
 #' its own serializers with better compatibility guarantees, which allow loading
 #' said models in other language bindings of XGBoost.
 #'
@@ -451,23 +451,24 @@ NULL
 #' not used for prediction / importance / plotting / etc.
 #' These R attributes are only preserved when using R's serializers.
 #'
-#' In addition to the regular `xgb.Booster` objects producted by [xgb.train()], the
-#' function [xgboost()] produces a different subclass `xgboost`, which keeps other
-#' additional metadata as R attributes such as class names in classification problems,
-#' and which has a dedicated `predict` method that uses different defaults. XGBoost's
+#' In addition to the regular `xgb.Booster` objects produced by [xgb.train()], the
+#' function [xgboost()] produces objects with a different subclass `xgboost` (which
+#' inherits from `xgb.Booster`), which keeps other additional metadata as R attributes
+#' such as class names in classification problems, and which has a dedicated `predict`
+#' method that uses different defaults and takes different argument names. XGBoost's
 #' own serializers can work with this `xgboost` class, but as they do not keep R
 #' attributes, the resulting object, when deserialized, is downcast to the regular
 #' `xgb.Booster` class (i.e. it loses the metadata, and the resulting object will use
-#' `predict.xgb.Booster` instead of `predict.xgboost`) - for these `xgboost` objects,
+#' [predict.xgb.Booster()] instead of [predict.xgboost()]) - for these `xgboost` objects,
 #' `saveRDS` might thus be a better option if the extra functionalities are needed.
 #'
 #' Note that XGBoost models in R starting from version `2.1.0` and onwards, and
 #' XGBoost models before version `2.1.0`, have a very different R object structure and
 #' are incompatible with each other. Hence, models that were saved with R serializers
 #' like [saveRDS()] or [save()] before version `2.1.0` will not work with later
 #' `xgboost` versions and vice versa. Be aware that the structure of R model objects
-#' could in theory change again in the future, so XGBoost's serializers
-#' should be preferred for long-term storage.
+#' could in theory change again in the future, so XGBoost's serializers should be
+#' preferred for long-term storage.
 #'
 #' Furthermore, note that using the package `qs` for serialization will require
 #' version 0.26 or higher of said package, and will have the same compatibility

diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
@@ -126,6 +126,8 @@ xgb.get.handle <- function(object) {
 #'   of the iterations (rounds) otherwise.
 #'
 #'   If passing "all", will use all of the rounds regardless of whether the model had early stopping or not.
+#'
+#'   Not applicable to `gblinear` booster.
 #' @param strict_shape Whether to always return an array with the same dimensions for the given prediction mode
 #'   regardless of the model type - meaning that, for example, both a multi-class and a binary classification
 #'   model would generate output arrays with the same number of dimensions, with the 'class' dimension having
@@ -144,7 +146,13 @@ xgb.get.handle <- function(object) {
 #'
 #'   If passing `TRUE`, then the result will have dimensions in reverse order - for example, rows
 #'   will be the last dimensions instead of the first dimension.
-#' @param base_margin Base margin used for boosting from existing model.
+#' @param base_margin Base margin used for boosting from existing model (raw score that gets added to
+#'   all observations independently of the trees in the model).
+#'
+#'   If supplied, should be either a vector with length equal to the number of rows in `newdata`
+#'   (for objectives which produce a single score per observation), or a matrix with the number of
+#'   rows matching the number of rows in `newdata` and the number of columns matching the number
+#'   of scores estimated by the model (e.g. number of classes for multi-class classification).
 #'
 #'   Note that, if `newdata` is an `xgb.DMatrix` object, this argument will
 #'   be ignored as it needs to be added to the DMatrix instead (e.g. by passing it as
@@ -206,6 +214,9 @@ xgb.get.handle <- function(object) {
 #' For multi-class / multi-target, they will be arranged so that columns in the output will have
 #' the leafs from one group followed by leafs of the other group (e.g. order will be `group1:feat1`,
 #' `group1:feat2`, ..., `group2:feat1`, `group2:feat2`, ...).
+#'
+#' If there is more than one parallel tree (e.g. random forests), the parallel trees will be the
+#' last grouping in the resulting order, which will still be 2D.
 #' \item For `predcontrib`: when not multi-class / multi-target, a matrix with dimensions
 #' `[nrows, nfeats+1]`. The last "+ 1" column corresponds to the baseline value.
 #'
@@ -222,7 +233,7 @@ xgb.get.handle <- function(object) {
 #' For multi-class and multi-target, will be a 4D array with dimensions `[nrows, ngroups, nfeats+1, nfeats+1]`
 #' }
 #'
-#' If passing `strict_shape=FALSE`, the result is always an array:
+#' If passing `strict_shape=TRUE`, the result is always a matrix (if 2D) or array (if 3D or higher):
 #' - For normal predictions, the dimension is `[nrows, ngroups]`.
 #' - For `predcontrib=TRUE`, the dimension is `[nrows, ngroups, nfeats+1]`.
 #' - For `predinteraction=TRUE`, the dimension is `[nrows, ngroups, nfeats+1, nfeats+1]`.

diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
@@ -9,12 +9,13 @@
 #' method (`tree_method = "hist"`, which is the default algorithm), but is not usable for the
 #' sorted-indices method (`tree_method = "exact"`), nor for the approximate method
 #' (`tree_method = "approx"`).
+#'
 #' @param data Data from which to create a DMatrix, which can then be used for fitting models or
 #' for getting predictions out of a fitted model.
 #'
-#' Supported input types are as follows:\itemize{
-#' \item `matrix` objects, with types `numeric`, `integer`, or `logical`.
-#' \item `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
+#' Supported input types are as follows:
+#' - `matrix` objects, with types `numeric`, `integer`, or `logical`.
+#' - `data.frame` objects, with columns of types `numeric`, `integer`, `logical`, or `factor`.
 #'
 #' Note that xgboost uses base-0 encoding for categorical types, hence `factor` types (which use base-1
 #' encoding) will be converted inside the function call. Be aware that the encoding used for `factor`
@@ -23,33 +24,14 @@
 #' was constructed.
 #'
 #' Other column types are not supported.
-#' \item CSR matrices, as class `dgRMatrix` from package `Matrix`.
-#' \item CSC matrices, as class `dgCMatrix` from package `Matrix`. These are **not** supported for
-#' 'xgb.QuantileDMatrix'.
-#' \item Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
-#' as a single row (only when making predictions from a fitted model).
-#' \item Text files in a supported format, passed as a `character` variable containing the URI path to
-#' the file, with an optional format specifier.
-#'
-#' These are **not** supported for `xgb.QuantileDMatrix`. Supported formats are:\itemize{
-#'   \item XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()].
-#'   \item SVMLight (a.k.a. LibSVM) format for CSR matrices. This format can be signaled by suffix
-#'     `?format=libsvm` at the end of the file path. It will be the default format if not
-#'     otherwise specified.
-#'   \item CSV files (comma-separated values). This format can be specified by adding suffix
-#'     `?format=csv` at the end ofthe file path. It will **not** be auto-deduced from file extensions.
-#'   }
+#' - CSR matrices, as class `dgRMatrix` from package `Matrix`.
+#' - CSC matrices, as class `dgCMatrix` from package `Matrix`.
 #'
-#' Be aware that the format of the file will not be auto-deduced - for example, if a file is named 'file.csv',
-#' it will not look at the extension or file contents to determine that it is a comma-separated value.
-#' Instead, the format must be specified following the URI format, so the input to `data` should be passed
-#' like this: `"file.csv?format=csv"` (or `"file.csv?format=csv&label_column=0"` if the first column
-#' corresponds to the labels).
+#' These are **not** supported by `xgb.QuantileDMatrix`.
+#' - XGBoost's own binary format for DMatrices, as produced by [xgb.DMatrix.save()].
+#' - Single-row CSR matrices, as class `dsparseVector` from package `Matrix`, which is interpreted
+#'   as a single row (only when making predictions from a fitted model).
 #'
-#' For more information about passing text files as input, see the articles
-#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/input_format.html}{Text Input Format of DMatrix} and
-#' \href{https://xgboost.readthedocs.io/en/stable/python/python_intro.html#python-data-interface}{Data Interface}.
-#' }
 #' @param label Label of the training data. For classification problems, should be passed encoded as
 #' integers with numeration starting at zero.
 #' @param weight Weight for each instance.
@@ -95,15 +77,9 @@
 #' @param label_lower_bound Lower bound for survival training.
 #' @param label_upper_bound Upper bound for survival training.
 #' @param feature_weights Set feature weights for column sampling.
-#' @param data_split_mode When passing a URI (as R `character`) as input, this signals
-#'   whether to split by row or column. Allowed values are `"row"` and `"col"`.
-#'
-#'   In distributed mode, the file is split accordingly; otherwise this is only an indicator on
-#'   how the file was split beforehand. Default to row.
-#'
-#'   This is not used when `data` is not a URI.
-#' @return An 'xgb.DMatrix' object. If calling 'xgb.QuantileDMatrix', it will have additional
-#' subclass 'xgb.QuantileDMatrix'.
+#' @param data_split_mode Not used yet. This parameter is for distributed training, which is not yet available for the R package.
+#' @return An 'xgb.DMatrix' object. If calling `xgb.QuantileDMatrix`, it will have additional
+#' subclass `xgb.QuantileDMatrix`.
 #'
 #' @details
 #' Note that DMatrix objects are not serializable through R functions such as [saveRDS()] or [save()].
@@ -145,6 +121,9 @@ xgb.DMatrix <- function(
   if (!is.null(group) && !is.null(qid)) {
     stop("Either one of 'group' or 'qid' should be NULL")
   }
+  if (data_split_mode != "row") {
+    stop("'data_split_mode' is not supported yet.")
+  }
   nthread <- as.integer(NVL(nthread, -1L))
   if (typeof(data) == "character") {
     if (length(data) > 1) {

diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R
@@ -86,7 +86,7 @@
 #' @export
 xgb.create.features <- function(model, data, ...) {
   check.deprecation(...)
-  pred_with_leaf <- predict(model, data, predleaf = TRUE)
+  pred_with_leaf <- predict.xgb.Booster(model, data, predleaf = TRUE)
   cols <- lapply(as.data.frame(pred_with_leaf), factor)
   cbind(data, sparse.model.matrix(~ . -1, cols)) # nolint
 }
diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R
@@ -16,7 +16,7 @@
 #' @param target_class Only relevant for multiclass models. The default (`NULL`)
 #'   averages the SHAP values over all classes. Pass a (0-based) class index
 #'   to show only SHAP values of that class.
-#' @param approxcontrib Passed to `predict()` when `shap_contrib = NULL`.
+#' @param approxcontrib Passed to [predict.xgb.Booster()] when `shap_contrib = NULL`.
 #' @param subsample Fraction of data points randomly picked for plotting.
 #'   The default (`NULL`) will use up to 100k data points.
 #' @param n_col Number of columns in a grid of plots.
@@ -353,7 +353,7 @@ xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
   }
 
   if (is.null(shap_contrib)) {
-    shap_contrib <- predict(
+    shap_contrib <- predict.xgb.Booster(
       model,
       newdata = data,
       predcontrib = TRUE,