Skip to content

Commit

Permalink
Add zero replacement
Browse files Browse the repository at this point in the history
  • Loading branch information
nfrerebeau committed Nov 17, 2023
1 parent cac5f71 commit 241745d
Show file tree
Hide file tree
Showing 10 changed files with 170 additions and 29 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ Collate:
'pca.R'
'plot.R'
'reexport.R'
'replace.R'
'show.R'
'simplex.R'
'statistics.R'
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ exportMethods(pca)
exportMethods(perturbation)
exportMethods(plot)
exportMethods(powering)
exportMethods(replace_zero)
exportMethods(scalar)
exportMethods(transform_alr)
exportMethods(transform_clr)
Expand Down
28 changes: 28 additions & 0 deletions R/AllGenerics.R
Original file line number Diff line number Diff line change
Expand Up @@ -932,6 +932,34 @@ setGeneric(
#' @rdname pca
NULL

# Missing Values ===============================================================
#' Zero-Replacement
#'
#' Multiplicative replacement of zeros in compositional data.
#' @param x An \eqn{m \times p}{m x p} [`CompositionMatrix-class`] object.
#' @param value A length-\eqn{p} [`numeric`] vector giving the detection limits
#' of each part (in \eqn{(0,1)}).
#' @param delta A [`numeric`] vector specifying the fraction of the detection
#' limit to be used in replacement.
#' @return
#' An \eqn{m \times p}{m x p} [`CompositionMatrix-class`] object, where all
#' zero values have been replaced.
#' @references
#' Aitchison, J. (1986). *The Statistical Analysis of Compositional Data*.
#' London: Chapman and Hall. \doi{10.1007/978-94-009-4109-0}.
#'
#' Martín-Fernández, J. A., Barceló-Vidal, C. & Pawlowsky-Glahn, V. (2003).
#' Dealing with Zeros and Missing Values in Compositional Data Sets Using
#' Nonparametric Imputation. *Mathematical Geology*, 35(3): 253-278.
#' \doi{10.1023/A:1023866030544}.
#' @example inst/examples/ex-zero.R
#' @author N. Frerebeau
#' @docType methods
#' @family imputation methods
#' @name zero
#' @rdname zero
NULL

# Outliers =====================================================================
#' Outlier Detection
#'
Expand Down
46 changes: 46 additions & 0 deletions R/replace.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# REPLACE ZEROS
#' @include AllGenerics.R
NULL

#' @export
#' @rdname zero
#' @aliases replace_zero,CompositionMatrix-method
setMethod(
  f = "replace_zero",
  signature = c(x = "CompositionMatrix"),
  definition = function(x, value = NULL, delta = 2/3) {
    ## Multiplicative zero replacement, applied row by row.
    ## x:     compositional matrix (one composition per row).
    ## value: detection limit(s); NULL returns x untouched, a single value is
    ##        recycled to every part, otherwise one value per column of x.
    ## delta: fraction(s) of the detection limit used as replacement; a single
    ##        value is recycled by the multiplication below.

    ## Nothing to replace without detection limits
    if (is.null(value)) return(x)

    ## Validation
    D <- ncol(x)
    if (length(value) == 1) {
      ## Expand a scalar detection limit to one value per part. The result
      ## must be assigned: a length-one sigma would otherwise be indexed out
      ## of bounds (yielding NA) in zero_multiplicative().
      value <- rep(value, D)
    } else if (length(value) > 1) {
      arkhe::assert_length(value, D)
    }
    if (length(delta) > 1) arkhe::assert_length(delta, D)

    ## Replacement value for each part
    sigma <- value * delta

    ## apply() over rows returns one column per row of x, hence the transpose
    repl <- apply(X = x, MARGIN = 1, FUN = zero_multiplicative, sigma = sigma)

    methods::initialize(x, t(repl))
  }
)

## Additive zero replacement for a single composition.
## x:     numeric vector of parts.
## sigma: replacement value.
## Zeros are set to sigma * (Z + 1) * (D - Z) / D^2 and every non-zero part is
## reduced by sigma * (Z + 1) * Z / D^2, where D is the number of parts and Z
## the number of zeros.
zero_additive <- function(x, sigma) {
  n_parts <- length(x)

  zeros <- x == 0
  n_zeros <- sum(zeros)

  ## Amount given to each zero and amount taken from each non-zero part
  fill <- (sigma * (n_zeros + 1) * (n_parts - n_zeros)) / n_parts^2
  shrink <- (sigma * (n_zeros + 1) * n_zeros) / n_parts^2

  x[zeros] <- fill
  x[!zeros] <- x[!zeros] - shrink

  x
}
## Multiplicative zero replacement for a single composition.
## x:     numeric vector of parts; the rescaling below assumes the parts sum
##        to 1.
## sigma: replacement value(s): either one value per part of x, or a single
##        value that is recycled to every part.
## Zeros are set to their sigma and every non-zero part is multiplied by
## (1 - sum of the sigmas actually imputed).
zero_multiplicative <- function(x, sigma) {
  D <- length(x)

  ## Recycle a scalar replacement value: indexing a length-one sigma with a
  ## length-D logical mask would return NA for any zero beyond the first part.
  if (length(sigma) == 1L) sigma <- rep(sigma, D)

  is_zero <- x == 0

  x[is_zero] <- sigma[is_zero]
  x[!is_zero] <- x[!is_zero] * (1 - sum(sigma[is_zero]))

  x
}
13 changes: 13 additions & 0 deletions inst/examples/ex-zero.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
## Create a data.frame
## (each column contains at least one zero to be replaced)
X <- data.frame(
Ca = c(7.72, 0, 3.11, 7.19, 7.41, 5, 0, 1, 4.51),
Fe = c(6.12, 5.88, 5.12, 0, 6.02, 0, 0, 5.28, 5.72),
Na = c(0.97, 1.59, 0, 0.86, 0.76, 0.51, 0.75, 0.52, 0.56)
)

## Coerce to a compositional matrix
Y <- as_composition(X)

## Replace zeros
## (value: one detection limit per part; delta: fraction of that limit used
## as the replacement)
Z <- replace_zero(Y, value = c(0.02, 0.1, 0.01), delta = 2/3)
Z
Binary file added inst/tinytest/_snaps/zero_multiplicative.rds
Binary file not shown.
11 changes: 11 additions & 0 deletions inst/tinytest/test_zero.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Replace zeros ================================================================
## Test data: every column contains at least one zero and the seventh row
## contains two
X <- data.frame(
Ca = c(7.72, 0, 3.11, 7.19, 7.41, 5, 0, 1, 4.51),
Fe = c(6.12, 5.88, 5.12, 0, 6.02, 0, 0, 5.28, 5.72),
Na = c(0.97, 1.59, 0, 0.86, 0.76, 0.51, 0.75, 0.52, 0.56)
)
Y <- as_composition(X)

## Multiplicative replacement
## (one detection limit per part; result is compared with the stored snapshot)
Z <- replace_zero(Y, value = c(0.02, 0.1, 0.01), delta = 2/3)
expect_equal_to_reference(Z, file = "_snaps/zero_multiplicative.rds")
54 changes: 54 additions & 0 deletions man/zero.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions pkgdown/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ reference:
- title: Visualization
contents:
- has_concept("plot methods")
- title: Missing Values
contents:
- has_concept("imputation methods")
- title: Multivariate Analysis
contents:
- has_concept("multivariate analysis")
Expand Down
42 changes: 13 additions & 29 deletions vignettes/bibliography.bib
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,6 @@ @article{filzmoser2005
pages = {579--587},
issn = {00983004},
doi = {10.1016/j.cageo.2004.11.013},
url = {http://linkinghub.elsevier.com/retrieve/pii/S0098300404002304},
urldate = {2018-07-01},
langid = {english}
}

Expand All @@ -90,8 +88,6 @@ @article{filzmoser2008
pages = {233--248},
issn = {1874-8961, 1874-8953},
doi = {10.1007/s11004-007-9141-5},
url = {http://link.springer.com/10.1007/s11004-007-9141-5},
urldate = {2017-11-22},
abstract = {Outlier detection based on the Mahalanobis distance (MD) requires an appropriate transformation in case of compositional data. For the family of logratio transformations (additive, centered and isometric logratio transformation) it is shown that the MDs based on classical estimates are invariant to these transformations, and that the MDs based on affine equivariant estimators of location and covariance are the same for additive and isometric logratio transformation. Moreover, for 3-dimensional compositions the data structure can be visualized by contour lines. In higher dimension the MDs of closed and opened data give an impression of the multivariate data behavior.},
langid = {english}
}
Expand Down Expand Up @@ -119,8 +115,6 @@ @article{filzmoser2012
pages = {77--85},
issn = {00983004},
doi = {10.1016/j.cageo.2011.06.014},
url = {http://linkinghub.elsevier.com/retrieve/pii/S0098300411002056},
urldate = {2017-11-26},
abstract = {Compositional data—and most data in geochemistry are of this type—carry relative rather than absolute information. For multivariate outlier detection methods this implies that not the given data but appropriately transformed data need to be used. We use the isometric logratio (ilr) transformation, which seems to be generally the most proper one for theoretical and practical reasons. In this space it is difficult to interpret the outliers, because the reason for outlyingness can be complex. Therefore we introduce tools that support the interpretation of outliers by representing multivariate information in biplots, maps, and univariate scatterplots.},
langid = {english}
}
Expand All @@ -134,8 +128,6 @@ @book{filzmoser2018
publisher = {{Springer-Verlag}},
location = {{Berlin Heidelberg}},
doi = {10.1007/978-3-319-96422-5},
url = {http://link.springer.com/10.1007/978-3-319-96422-5},
urldate = {2020-06-03},
isbn = {978-3-319-96420-1 978-3-319-96422-5},
langid = {english}
}
Expand All @@ -150,8 +142,6 @@ @article{fiserova2011
pages = {455--468},
issn = {1874-8961, 1874-8953},
doi = {10.1007/s11004-011-9333-x},
url = {http://link.springer.com/10.1007/s11004-011-9333-x},
urldate = {2020-06-03},
langid = {english}
}

Expand All @@ -177,8 +167,6 @@ @article{greenacre2021
pages = {271--299},
issn = {2326-8298, 2326-831X},
doi = {10.1146/annurev-statistics-042720-124436},
url = {https://www.annualreviews.org/doi/10.1146/annurev-statistics-042720-124436},
urldate = {2022-11-11},
abstract = {Compositional data are nonnegative data carrying relative, rather than absolute, information—these are often data with a constant-sum constraint on the sample values, for example, proportions or percentages summing to 1\% or 100\%, respectively. Ratios between components of a composition are important since they are unaffected by the particular set of components chosen. Logarithms of ratios (logratios) are the fundamental transformation in the ratio approach to compositional data analysis—all data thus need to be strictly positive, so that zero values present a major problem. Components that group together based on domain knowledge can be amalgamated (i.e., summed) to create new components, and this can alleviate the problem of data zeros. Once compositional data are transformed to logratios, regular univariate and multivariate statistical analysis can be performed, such as dimension reduction and clustering, as well as modeling. Alternative methodologies that come close to the ideals of the logratio approach are also considered, especially those that avoid the problem of data zeros, which is particularly acute in large bioinformatic data sets.},
langid = {english}
}
Expand All @@ -194,8 +182,19 @@ @article{hron2017
pages = {797--814},
issn = {1874-8961, 1874-8953},
doi = {10.1007/s11004-017-9684-z},
url = {http://link.springer.com/10.1007/s11004-017-9684-z},
urldate = {2021-12-17},
langid = {english}
}

@article{martin-fernandez2003,
title = {Dealing with {{Zeros}} and {{Missing Values}} in {{Compositional Data Sets Using Nonparametric Imputation}}},
author = {Martín-Fernández, J. A. and Barceló-Vidal, C. and Pawlowsky-Glahn, V.},
date = {2003},
journaltitle = {Mathematical Geology},
volume = {35},
number = {3},
pages = {253--278},
issn = {08828121},
doi = {10.1023/A:1023866030544},
langid = {english}
}

Expand All @@ -209,8 +208,6 @@ @article{mommsen1988
pages = {47--57},
issn = {0003-813X, 1475-4754},
doi = {10.1111/j.1475-4754.1988.tb00434.x},
url = {http://doi.wiley.com/10.1111/j.1475-4754.1988.tb00434.x},
urldate = {2017-10-16},
abstract = {A new similarity measure is proposed, the ‘dilution factor spread’ or, derived from it, a ‘goodness of fit’ parameter. This has the advantage that raw data without any transformation can be used, diluted samples are recognized and errors of individual concentration values can easily be included. The use of this similarity coefficient in multivariate cluster analyses to construct dendrograms is shown and compared with the use of the well-known similarity measure of Euclidian distance in attribute space and of the cos θ measure.},
langid = {english}
}
Expand All @@ -226,7 +223,6 @@ @article{mommsen2007
pages = {359--371},
issn = {0003-813X, 1475-4754},
doi = {10.1111/j.1475-4754.2007.00306.x},
url = {http://doi.wiley.com/10.1111/j.1475-4754.2007.00306.x},
abstract = {Neutron activation analysis of pottery was established at Bonn in 1983 and has since become one of the primary archaeometry-based analytical techniques at the facility. A brief history of the laboratory and a discussion of the best relative fit procedure for pottery is provided. When comparing concentration data for pottery, a best relative fit should always be considered. This mathematical procedure generally results in ‘sharper’ concentration patterns and improves the separability of chemically not very different compositional groups. This is demonstrated for a set of 30 Late Cypriot (Myc. IIIC1) pottery samples from Sinda, Cyprus, which allow formation of a good reference pattern for this site. Applying factors in the range of 0.82-1.43, a number of samples from Egypt and Palestine can be assigned with high probability to a Cypriot origin.},
langid = {english}
}
Expand Down Expand Up @@ -256,8 +252,6 @@ @article{rousseeuw1990
pages = {633--639},
issn = {0162-1459, 1537-274X},
doi = {10.1080/01621459.1990.10474920},
url = {http://www.tandfonline.com/doi/abs/10.1080/01621459.1990.10474920},
urldate = {2022-06-10},
langid = {english}
}

Expand All @@ -272,16 +266,6 @@ @article{santos2020
pages = {102423},
issn = {2352409X},
doi = {10.1016/j.jasrep.2020.102423},
url = {https://linkinghub.elsevier.com/retrieve/pii/S2352409X20302145},
urldate = {2023-09-13},
langid = {english}
}

@article{tsagris2011,
title = {A Data-Based Power Transformation for Compositional Data},
author = {Tsagris, Michail T. and Preston, Simon and Wood, Andrew T. A.},
date = {2011},
url = {https://arxiv.org/pdf/1106.1451.pdf},
langid = {english}
}

Expand Down

0 comments on commit 241745d

Please sign in to comment.