When using PCA or other dimensionality reduction methods, we often face questions like: How many components should we keep? Which preprocessing strategy works best? Will the model generalize to new data?
Cross-validation provides principled answers by testing how well models trained on one subset of data perform on held-out data.
Let’s use the iris dataset to find the optimal number of PCA components via reconstruction error.
set.seed(123)
X <- as.matrix(iris[, 1:4]) # 150 samples x 4 features
# Create 5-fold cross-validation splits
K <- 5
fold_ids <- sample(rep(1:K, length.out = nrow(X)))
folds <- lapply(1:K, function(k) list(
train = which(fold_ids != k),
test = which(fold_ids == k)
))

Define the fitting and measurement functions. The measurement function projects test data, reconstructs it, and computes RMSE:
fit_pca <- function(train_data, ncomp) {
pca(train_data, ncomp = ncomp, preproc = center())
}
measure_reconstruction <- function(model, test_data) {
# Project test data to score space
scores <- project(model, test_data)
# Reconstruct: scores %*% t(loadings), then reverse centering
recon <- scores %*% t(model$v)
recon <- inverse_transform(model$preproc, recon)
# Compute RMSE
rmse <- sqrt(mean((test_data - recon)^2))
tibble::tibble(rmse = rmse)
}

Now run cross-validation for 1–4 components:
results_list <- lapply(1:4, function(nc) {
cv_res <- cv_generic(
data = X,
folds = folds,
.fit_fun = fit_pca,
.measure_fun = measure_reconstruction,
fit_args = list(ncomp = nc),
backend = "serial"
)
# Extract RMSE from each fold and average
fold_rmse <- sapply(cv_res$metrics, function(m) m$rmse)
data.frame(ncomp = nc, rmse = mean(fold_rmse))
})
cv_results <- do.call(rbind, results_list)
print(cv_results)
#> ncomp rmse
#> 1 1 2.949027e-01
#> 2 2 1.603380e-01
#> 3 3 7.673558e-02
#> 4     4 4.440794e-16

Two components capture most of the structure; additional components yield diminishing returns. (With all 4 components, the reconstruction of 4-dimensional data is exact, so the RMSE sits at machine precision.)
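To visualize the trade-off, plot the mean CV RMSE against the number of components and look for the elbow (ggplot2 is also used for plotting later in this article):

library(ggplot2)

# CV error curve: the elbow suggests a good balance between
# reconstruction accuracy and model complexity
ggplot(cv_results, aes(x = ncomp, y = rmse)) +
  geom_line() +
  geom_point() +
  labs(x = "Number of components", y = "Mean CV RMSE")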
The cv_generic() function returns a tibble with one row per fold; of its three columns, metrics is a list-column holding the tibble produced by the measurement function for each fold:
# Run CV once to inspect the structure
cv_example <- cv_generic(
X, folds,
.fit_fun = fit_pca,
.measure_fun = measure_reconstruction,
fit_args = list(ncomp = 2)
)
# Structure overview
str(cv_example, max.level = 1)
#> tibble [5 × 3] (S3: tbl_df/tbl/data.frame)
# Extract metrics from all folds
all_metrics <- dplyr::bind_rows(cv_example$metrics)
print(all_metrics)
#> # A tibble: 5 × 1
#> rmse
#> <dbl>
#> 1 0.157
#> 2 0.135
#> 3 0.181
#> 4 0.132
#> 5 0.197

Use cv_generic() to compare centering alone versus z-scoring:
prep_center <- center()
prep_zscore <- colscale(center(), type = "z")
fit_with_prep <- function(train_data, ncomp, preproc) {
pca(train_data, ncomp = ncomp, preproc = preproc)
}
# Compare both strategies with 3 components
cv_center <- cv_generic(
X, folds,
.fit_fun = fit_with_prep,
.measure_fun = measure_reconstruction,
fit_args = list(ncomp = 3, preproc = prep_center)
)
cv_zscore <- cv_generic(
X, folds,
.fit_fun = fit_with_prep,
.measure_fun = measure_reconstruction,
fit_args = list(ncomp = 3, preproc = prep_zscore)
)
rmse_center <- mean(sapply(cv_center$metrics, `[[`, "rmse"))
rmse_zscore <- mean(sapply(cv_zscore$metrics, `[[`, "rmse"))
cat("Center only - RMSE:", round(rmse_center, 4), "\n")
#> Center only - RMSE: 0.0764
cat("Z-score - RMSE:", round(rmse_zscore, 4), "\n")
#> Z-score - RMSE: 0.1053

For iris, centering alone performs slightly better since the variables are already on similar scales.
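A quick check supports this claim: the per-feature standard deviations are all of the same order of magnitude, so z-scoring has little to offer here.

# Standard deviation of each iris feature (same order of magnitude)
round(apply(X, 2, sd), 2)
#> Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
#>         0.83         0.44         1.77         0.76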
For larger datasets, run folds in parallel:
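The snippet below is a sketch: it assumes cv_generic()'s backend argument accepts "future" (mirroring the backend = "serial" used above) and that the future package is installed; consult ?cv_generic for the exact backend names supported by your version.

# Assumption: backend = "future" enables parallel fold execution
library(future)
plan(multisession, workers = 4)  # one R process per worker

cv_par <- cv_generic(
  X, folds,
  .fit_fun = fit_pca,
  .measure_fun = measure_reconstruction,
  fit_args = list(ncomp = 2),
  backend = "future"
)

plan(sequential)  # restore serial execution when done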
You can return multiple metrics from the measurement function:
measure_multi <- function(model, test_data) {
scores <- project(model, test_data)
recon <- scores %*% t(model$v)
recon <- inverse_transform(model$preproc, recon)
residuals <- test_data - recon
ss_res <- sum(residuals^2)
  ss_tot <- sum((test_data - mean(test_data))^2)  # total SS around the grand mean of all entries
tibble::tibble(
rmse = sqrt(mean(residuals^2)),
mae = mean(abs(residuals)),
r2 = 1 - ss_res / ss_tot
)
}
cv_multi <- cv_generic(
X, folds,
.fit_fun = fit_pca,
.measure_fun = measure_multi,
fit_args = list(ncomp = 3)
)
all_metrics <- dplyr::bind_rows(cv_multi$metrics)
print(all_metrics)
#> # A tibble: 5 × 3
#> rmse mae r2
#> <dbl> <dbl> <dbl>
#> 1 0.0690 0.0475 0.999
#> 2 0.0603 0.0431 0.999
#> 3 0.106 0.0716 0.997
#> 4 0.0624 0.0453 0.999
#> 5 0.0863 0.0680 0.998
cat("\nMean across folds:\n")
#>
#> Mean across folds:
cat("RMSE:", round(mean(all_metrics$rmse), 4), "\n")
#> RMSE: 0.0767
cat("MAE: ", round(mean(all_metrics$mae), 4), "\n")
#> MAE: 0.0551
cat("R2: ", round(mean(all_metrics$r2), 4), "\n")
#> R2: 0.9984

| Metric | Description | Interpretation |
|---|---|---|
| RMSE | Root mean squared error | Lower is better; in original units |
| MAE | Mean absolute error | Less sensitive to outliers than RMSE |
| R² | Coefficient of determination | Proportion of variance explained (1 = perfect) |
Always fit preprocessing parameters inside the CV loop:
# WRONG: Preprocessing outside CV leaks information
X_scaled <- scale(X) # Uses mean/sd from ALL samples including test!
cv_wrong <- cv_generic(X_scaled, folds, ...)
# RIGHT: Let the model handle preprocessing internally
# Each fold fits centering/scaling on training data only
fit_pca <- function(train_data, ncomp) {
pca(train_data, ncomp = ncomp, preproc = center())
}

The CV framework works with any projector type; the key is writing appropriate fit and measure functions.
# Nyström approximation for kernel PCA
fit_nystrom <- function(train_data, ncomp) {
nystrom_approx(train_data, ncomp = ncomp, nlandmarks = 50, preproc = center())
}
# Kernel-space reconstruction error
measure_kernel <- function(model, test_data) {
S <- project(model, test_data)
K_hat <- S %*% t(S)
Xc <- reprocess(model, test_data)
K_true <- Xc %*% t(Xc)
tibble::tibble(kernel_rmse = sqrt(mean((K_hat - K_true)^2)))
}
cv_nystrom <- cv_generic(
X, folds,
.fit_fun = fit_nystrom,
.measure_fun = measure_kernel,
fit_args = list(ncomp = 10)
)

The nystrom_approx() function provides two variants:

- method = "standard": Williams–Seeger single-stage Nyström with the usual scaling
- method = "double": Lim–Jin–Zhang two-stage Nyström (efficient when p is large)

With a centered linear kernel and all points as landmarks (m = N), the Nyström eigen-decomposition recovers the exact top eigenpairs of the kernel matrix K = X_c X_c^T. Below is a reproducible snippet that demonstrates this and shows how to project new data.
set.seed(123)
X <- matrix(rnorm(80 * 10), 80, 10)
ncomp <- 5
# Exact setting: linear kernel + centering + m = N
fit_std <- nystrom_approx(
X, ncomp = ncomp, landmarks = 1:nrow(X), preproc = center(), method = "standard"
)
# Compare kernel eigenvalues: eig(K) vs fit_std$sdev^2
Xc <- transform(fit_std$preproc, X)
K <- Xc %*% t(Xc)
lam_K <- eigen(K, symmetric = TRUE)$values[1:ncomp]
data.frame(
component = 1:ncomp,
nystrom = sort(fit_std$sdev^2, decreasing = TRUE),
exact_K = sort(lam_K, decreasing = TRUE)
)
#> component nystrom exact_K
#> 1 1 117.64481 117.64481
#> 2 2 112.66863 112.66863
#> 3 3 103.44825 103.44825
#> 4 4 83.17891 83.17891
#> 5 5 78.30886 78.30886
# Relationship with PCA: prcomp() returns singular values / sqrt(n - 1)
p <- prcomp(Xc, center = FALSE, scale. = FALSE)
lam_from_pca <- p$sdev[1:ncomp]^2 * (nrow(X) - 1) # equals eig(K)
data.frame(
component = 1:ncomp,
from_pca = sort(lam_from_pca, decreasing = TRUE),
exact_K = sort(lam_K, decreasing = TRUE)
)
#> component from_pca exact_K
#> 1 1 117.64481 117.64481
#> 2 2 112.66863 112.66863
#> 3 3 103.44825 103.44825
#> 4 4 83.17891 83.17891
#> 5 5 78.30886 78.30886
# Out-of-sample projection for new rows
new_rows <- 1:5
scores_new <- project(fit_std, X[new_rows, , drop = FALSE])
head(scores_new)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] -0.5065700 -0.003782324 -0.89690602 -1.2402365 -0.2466715
#> [2,] -0.3673067 0.489430969 -1.23213982 -1.5330320 0.7247966
#> [3,] 1.9065578 -0.008104080 -0.61654002 1.5191661 1.2188904
#> [4,] -1.5843734 -0.908340577 -0.94045424 -2.8897545 -2.0328659
#> [5,] 0.5690207 0.002415067 0.09988366 -0.0194067 0.5888599
# Double Nyström collapses to standard when l = m = N
fit_dbl <- nystrom_approx(
X, ncomp = ncomp, landmarks = 1:nrow(X), preproc = center(), method = "double", l = nrow(X)
)
all.equal(sort(fit_std$sdev^2, decreasing = TRUE), sort(fit_dbl$sdev^2, decreasing = TRUE))
#> [1] TRUE

For large feature counts (p >> n), set method = "double" and choose a modest intermediate rank l to reduce the size of the small eigenproblem. Provide a custom kernel_func if you need a non-linear kernel, e.g. an RBF kernel:
# Example RBF kernel
gaussian_kernel <- function(A, B, sigma = 1) {
# ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a·b
G <- A %*% t(B)
a2 <- rowSums(A * A)
b2 <- rowSums(B * B)
D2 <- outer(a2, b2, "+") - 2 * G
exp(-D2 / (2 * sigma^2))
}
fit_rbf <- nystrom_approx(
X, ncomp = 8, nlandmarks = 40, preproc = center(), method = "double", l = 20,
kernel_func = gaussian_kernel
)
scores_rbf <- project(fit_rbf, X[1:10, ])

This package includes unit tests that validate Nyström correctness:

- exact recovery of the top eigenpairs when m = N (centered linear kernel)
- double Nyström collapsing to the standard variant when l = m = N
- project() reproducing training scores and matching the manual formulas for both methods

See tests/testthat/test_nystrom.R in the source for details; a minimal version of the training-score check is sketched below.
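The sketch assumes the fitted object stores its training scores in $s (an assumption based on multivarious's bi_projector convention, consistent with the $v and $sdev fields used above); if your version stores them differently, adapt the field name.

# Assumption: training scores live in fit_std$s (bi_projector convention)
scores_train <- project(fit_std, X)
all.equal(unname(as.matrix(scores_train)), unname(as.matrix(fit_std$s)))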
Below we compare PCA and Nyström (linear kernel) via a kernel‑space
RMSE on held‑out folds. For a test block with preprocessed data
X_test_c, the true kernel is
K_true = X_test_c %*% t(X_test_c). With a
rank‑k model, the approximated kernel is
K_hat = S %*% t(S), where S are the component
scores returned by project().
set.seed(202)
# PCA fit function (reuses earlier fit_pca)
fit_pca <- function(train_data, ncomp) {
pca(train_data, ncomp = ncomp, preproc = center())
}
# Nyström fit function (standard variant, linear kernel, no RSpectra needed for small data)
fit_nystrom <- function(train_data, ncomp, nlandmarks = 50) {
nystrom_approx(train_data, ncomp = ncomp, nlandmarks = nlandmarks,
preproc = center(), method = "standard", use_RSpectra = FALSE)
}
# Kernel-space RMSE metric for a test fold
measure_kernel_rmse <- function(model, test_data) {
S <- project(model, test_data)
K_hat <- S %*% t(S)
Xc <- reprocess(model, test_data)
K_true <- Xc %*% t(Xc)
tibble::tibble(kernel_rmse = sqrt(mean((K_hat - K_true)^2)))
}
# Use a local copy of iris data and local folds for this comparison.
# Note: scale() is applied globally here to keep the example short; per the
# leakage warning above, prefer fitting any scaling inside each fold.
X_cv <- as.matrix(scale(iris[, 1:4]))
K <- 5
fold_ids <- sample(rep(1:K, length.out = nrow(X_cv)))
folds_cv <- lapply(1:K, function(k) list(
train = which(fold_ids != k),
test = which(fold_ids == k)
))
# Compare for k = 3 components
k_sel <- 3
cv_pca_kernel <- cv_generic(
X_cv, folds_cv,
.fit_fun = fit_pca,
.measure_fun = measure_kernel_rmse,
fit_args = list(ncomp = k_sel)
)
cv_nys_kernel <- cv_generic(
X_cv, folds_cv,
.fit_fun = fit_nystrom,
.measure_fun = measure_kernel_rmse,
fit_args = list(ncomp = k_sel, nlandmarks = 50)
)
metrics_pca <- dplyr::bind_rows(cv_pca_kernel$metrics)
metrics_nys <- dplyr::bind_rows(cv_nys_kernel$metrics)
rmse_pca <- mean(metrics_pca$kernel_rmse, na.rm = TRUE)
rmse_nys <- mean(metrics_nys$kernel_rmse, na.rm = TRUE)
cv_summary <- data.frame(
method = c("PCA", "Nyström (linear)"),
kernel_rmse = c(rmse_pca, rmse_nys)
)
print(cv_summary)
#> method kernel_rmse
#> 1 PCA 0.02153248
#> 2 Nyström (linear) 0.02266859
# Simple bar plot
ggplot(cv_summary, aes(x = method, y = kernel_rmse, fill = method)) +
geom_col(width = 0.6) +
guides(fill = "none") +
labs(title = "Cross‑validated kernel RMSE (k = 3)", y = "Kernel RMSE", x = NULL)The multivarious CV framework provides: - Easy
cross-validation for any dimensionality reduction method - Flexible
metric calculation - Parallel execution support - Tidy output format for
easy analysis
Use it to make informed decisions about model complexity and ensure your dimensionality reduction generalizes well to new data.