Skip to content

Commit

Permalink
Revert the changes with normalized scores, this was completely wrong
Browse files Browse the repository at this point in the history
  • Loading branch information
florianneukirchen committed Mar 23, 2024
1 parent 1e8b9ef commit 898fa9b
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 98 deletions.
51 changes: 4 additions & 47 deletions algs/scipy_pca_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,6 @@ class SciPyPCAAlgorithm(QgsProcessingAlgorithm):

OUTPUT = 'OUTPUT'
INPUT = 'INPUT'
NORMALIZED = 'NORMALIZED'
NCOMPONENTS = 'NCOMPONENTS'
PERCENTVARIANCE = 'PERCENTVARIANCE'
DTYPE = 'DTYPE'
Expand All @@ -81,8 +80,6 @@ class SciPyPCAAlgorithm(QgsProcessingAlgorithm):
<i>number of components</i> to keep or the <i>percentage of variance</i> \
explained by the kept components can be set.
<b>Normalize</b> Compute normalized version of PCA scores, by dividing \
by sqrt(n_samples - 1). Number of samples means number of pixels in this case. \
<b>Number of components</b> is only used if the value is greater than 0 and \
smaller than the count of original bands and if percentage of variance is \
Expand All @@ -93,7 +90,7 @@ class SciPyPCAAlgorithm(QgsProcessingAlgorithm):
<b>Output</b> The output raster contains \
the data projected into the principal components \
(i.e. the normalized or unnormalized scores).
(i.e. the PCA scores).
<b>Output data type</b> Float32 or Float64
Expand All @@ -112,24 +109,6 @@ class SciPyPCAAlgorithm(QgsProcessingAlgorithm):
<li>Band Mean</li>
</ul>
<b>Note on Normalization</b> The normalized version of the PCA \
can be calculated in two ways:
<ul>
<li>By first dividing the data by \
sqrt(n_samples - 1); then performing SVD and calculating \
scores as dot productuct of normalized data and V (of the SVD, \
not transposed).</li>
<li>By performing SVD with unnormalized data, then calculating \
loadings by dividing the eigenvectors by sqrt(n_samples - 1) \
and calculating scores as dot product of unnormalized data \
and loadings.
</ul>
Unnmormalized scores are calculated as dot product of \
unnormalized data and the (unnormalized) eigenvectors \
of the SVD. (The result is the same as PCA using the \
python module sklearn without normalizing the data). )
"""

# Init Algorithm
Expand All @@ -147,12 +126,6 @@ def initAlgorithm(self, config):
)
)

self.addParameter(QgsProcessingParameterBoolean(
self.NORMALIZED,
self.tr('Normalize'),
optional=True,
defaultValue=False,
))

self.addParameter(QgsProcessingParameterNumber(
self.NCOMPONENTS,
Expand Down Expand Up @@ -202,8 +175,6 @@ def processAlgorithm(self, parameters, context, feedback):
self.inputlayer = self.parameterAsRasterLayer(parameters, self.INPUT, context)
self.output_raster = self.parameterAsOutputLayer(parameters, self.OUTPUT,context)

self.normalized = self.parameterAsBool(parameters, self.NORMALIZED, context)

self.ncomponents = self.parameterAsInt(parameters, self.NCOMPONENTS,context)
self.percentvariance = self.parameterAsDouble(parameters, self.PERCENTVARIANCE,context)

Expand Down Expand Up @@ -259,9 +230,6 @@ def processAlgorithm(self, parameters, context, feedback):
# The constant used for normalization in PCA is: 1 / sqrt(n_samples)
# or 1 / sqrt(n_samples - 1)

# Normalizing afterwards means:
# Normalized scores are data @ loadings
# Unnormalized scores are data @ VT.T

U, S, VT = linalg.svd(centered,full_matrices=False)

Expand Down Expand Up @@ -308,10 +276,7 @@ def processAlgorithm(self, parameters, context, feedback):
return {}

# Get the scores, i.e. the data in principal components
if self.normalized:
new_array = centered @ loadings
else:
new_array = centered @ VT.T
new_array = centered @ VT.T


# Reshape to original shape
Expand Down Expand Up @@ -365,16 +330,11 @@ def processAlgorithm(self, parameters, context, feedback):
'variance_ratio': variance_ratio.tolist(),
'variance explained cumsum': variance_explained_cumsum.tolist(),
'band mean': col_mean.tolist(),
'is normalized': self.normalized,
})

# Save loadings etc as json in the metadata abstract of the layer
if self.normalized:
layername = "PCA (normalized)"
else:
layername = "PCA (not normalized)"
global updatemetadata
updatemetadata = self.UpdateMetadata(encoded, layername)
updatemetadata = self.UpdateMetadata(encoded)
context.layerToLoadOnCompletionDetails(self.output_raster).setPostProcessor(updatemetadata)

return {self.OUTPUT: self.output_raster,
Expand All @@ -385,7 +345,6 @@ def processAlgorithm(self, parameters, context, feedback):
'variance explained cumsum': variance_explained_cumsum,
'band mean': col_mean,
'eigenvectors': VT.T,
'is normalized': self.normalized,
'json': encoded}


Expand All @@ -401,13 +360,11 @@ class UpdateMetadata(QgsProcessingLayerPostProcessorInterface):
"""
To add metadata in the postprocessing step.
"""
def __init__(self, abstract, layername):
def __init__(self, abstract):
self.abstract = abstract
self.layername = layername
super().__init__()

def postProcessLayer(self, layer, context, feedback):
layer.setName(self.layername)
meta = layer.metadata()
meta.setAbstract(self.abstract)
layer.setMetadata(meta)
Expand Down
62 changes: 11 additions & 51 deletions algs/scipy_pca_helper_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ class SciPyTransformPcBaseclass(QgsProcessingAlgorithm):
_inverse = False
_keepbands = 0
falsemean = False
msg = ""

_bandmean = None
V = None
Expand Down Expand Up @@ -289,8 +288,6 @@ def processAlgorithm(self, parameters, context, feedback):
"""
self.get_parameters(parameters, context)

if self.msg != "":
feedback.reportError(self.tr(self.msg), fatalError=False)

self.ds = gdal.Open(self.inputlayer.source())

Expand Down Expand Up @@ -414,34 +411,14 @@ def json_to_parameters(self, s):
decoded = json.loads(s)
except (json.decoder.JSONDecodeError, ValueError, TypeError):
return None, None
is_normalized = decoded.get("is normalized", None)

if is_normalized is None:
self.msg = "Metadata does not tell if these are normalized scores. Calculating assuming unnormalized scores."
is_normalized = False
print("is normalized", is_normalized)
if is_normalized:
eigenvectors = decoded.get("loadings", None)
else:
eigenvectors = decoded.get("eigenvectors", None)

print(is_normalized)
print(eigenvectors)

eigenvectors = decoded.get("eigenvectors", None)

try:
eigenvectors = np.array(eigenvectors)
except (ValueError, TypeError):
eigenvectors = None

if is_normalized:
eigenvals = decoded.get("variance explained", None)
if not eigenvals is None:
try:
eigenvals = np.array(eigenvals)
eigenvectors = (eigenvectors / eigenvals)
except (ValueError, TypeError):
msg = "Could not read eigenvalues from metadata"

means = decoded.get("band mean", 0)
try:
means = np.array(means)
Expand Down Expand Up @@ -500,29 +477,21 @@ class SciPyTransformToPCAlgorithm(SciPyTransformPcBaseclass):

_help = """
Transform data into given principal components \
with a matrix of weights (eigenvectors or loadings) by taking the \
with a matrix of eigenvectors by taking the \
dot product with a matrix of weights (after centering the data). \
The eigenvectors / loadings can also be read from the metadata of an \
existing PCA layer. Normalized PCA scores are only partially supported.
The eigenvectors can also be read from the metadata of an \
existing PCA layer.
<b>Eigenvectors</b> Matrix of eigenvectors or loadings (as string). \
<b>Eigenvectors</b> Matrix of eigenvectors (as string). \
Optional if the next parameter is set. \
The matrix can be taken from the output of the PCA algorith of this plugin. \
Using eigenvectors, the result will be unnormalized PCA scores. \
Using loadings, the result will be normalized PCA scores, \
but the metadata of the layer will be incorrect (and automatic \
inverse transform from PC does not work).
<b>Read eigenvectors / loadings from PCA layer metadata</b> \
<b>Read eigenvectors from PCA layer metadata</b> \
Reads the weights for the transformation from the metadata \
of a layer that was generated using the PCA algorithm of this plugin. \
Ignored if the parameter <i>eigenvectors</i> is used. \
The eigenvectors are used, if the layer contains unnormalized scores. \
The loadings are used, if the layer contains normalized scores; however, \
in this case, the metadata of the result will not be correct \
(and automatic inverse transform from PC does not work).
Ignored if the parameter <i>eigenvectors</i> is used.
<b>Number of components</b> is only used if the value is greater than 0 and \
smaller than the count of original bands.
Expand Down Expand Up @@ -554,7 +523,7 @@ def initAlgorithm(self, config):
self.addParameter(
QgsProcessingParameterRasterLayer(
self.PARAMETERLAYER,
self.tr('Read eigenvectors/loadings from PCA layer metadata'),
self.tr('Read eigenvectors from PCA layer metadata'),
optional=True,
)
)
Expand Down Expand Up @@ -610,21 +579,12 @@ class SciPyTransformFromPCAlgorithm(SciPyTransformPcBaseclass):
dot product of the scores the with the transpose of the matrix of eigenvectors \
and adding the original means to the result.
Normalized PCA scores are only partially supported, see below. \
The eigenvectors / loadings can also be read from the metadata \
The eigenvectors can also be read from the metadata \
of the input layer, as long as they exist and are complete. \
If the layer contains the PCA generated with the PCA \
algorithm of this plugin (i.e. the meta data is complete), \
the transform works for both normalized and unnormalized scores \
without changing any parameters.
<b>Eigenvectors</b> Matrix of eigenvectors (as string). \
Optional if the next parameter is set. \
The matrix can be taken from the output of the PCA algorith of this plugin. \
Assumes that the input contains unnormalized PCA scores. \
For normalized PCA scores, divide the loadings matrix by \
the eigenvalues ("variance explained") and enter the result \
into the <i>eigenvectors</i> text field.
<b>Mean of original bands</b> As first step of PCA, the data of each \
band is centered by subtracting the means. These must be added \
Expand Down

0 comments on commit 898fa9b

Please sign in to comment.