% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/LogReg_rf_fast.R
\name{LogReg_rf_fast}
\alias{LogReg_rf_fast}
\title{Fast random-forest classifier with stratified CV and in-fold sampling (ranger, caret-free training loop)}
\usage{
LogReg_rf_fast(
  X,
  moz,
  Y,
  number = 5,
  repeats = 1,
  Metric = c("Kappa", "Accuracy", "F1", "AdjRankIndex", "MatthewsCorrelation"),
  Sampling = c("no", "up", "down", "smote"),
  ncores = max(1L, parallel::detectCores() - 1L),
  num.trees = 500L,
  tuneLength = 5L,
  folds_parallel = c("auto", "TRUE", "FALSE"),
  seed = 123L,
  mtry = NULL,
  splitrule = "gini",
  sample.fraction = 1,
  min.node.size.grid = c(1L, 5L, 10L),
  min_node_frac = 1/3
)
}
\arguments{
\item{X}{Numeric matrix or data frame; rows are samples and columns are features (m/z).
Column names must be numeric (coercible with as.numeric), representing the feature m/z.
Non-finite values are set to 0 internally.}

\item{moz}{Numeric vector of m/z to keep. Only columns of X whose numeric names
match values in \code{moz} are used. An error is raised if none match.}

\item{Y}{Factor (or coercible to factor) of class labels; length must equal nrow(X).}

\item{number}{Integer; number of CV folds (k). Default 5.}

\item{repeats}{Integer; number of CV repeats. Default 1.}

\item{Metric}{Character; CV selection metric. One of
"Kappa", "Accuracy", "F1", "AdjRankIndex", "MatthewsCorrelation".
The best hyperparameters maximize this metric averaged over folds.}

\item{Sampling}{Character; class-balancing strategy applied within each training fold
(and before the final fit on the full data). One of "no", "up", "down", "smote".
\itemize{
\item "up": up-samples minority classes to the majority count (base R).
\item "down": down-samples majority classes to the minority count (base R).
\item "smote": uses the package's internal \code{smote_classif(Y ~ ., data.frame(Y, X), C.perc = "balance")}.
}}

\item{ncores}{Integer; number of CPU cores to use. Controls both fold-level parallelism
and ranger threads when not parallelizing folds. Default is all but one core.}

\item{num.trees}{Integer; number of trees per ranger model. Default 500.}

\item{tuneLength}{Integer; upper bound on the size of the hyperparameter grid.
If the full grid (mtry × min.node.size) is larger, a random subset of size
\code{tuneLength} is used. Default 5.}

\item{folds_parallel}{Character; "auto", "TRUE", or "FALSE".
\itemize{
\item "auto": parallelize across folds when ncores >= 2 and total folds (number × repeats) >= 2.
\item "TRUE": force fold-level parallelism (PSOCK on Windows).
\item "FALSE": evaluate folds sequentially; ranger then uses up to \code{ncores} threads per fit.
}}

\item{seed}{Integer; RNG seed for reproducibility. Default 123.}

\item{mtry}{Optional integer; if provided, fixes the number of variables tried at each split.
If NULL (default), a small grid around floor(sqrt(p)) is used, where p = number of features.}

\item{splitrule}{Character; ranger split rule (e.g., "gini", "extratrees"). Default "gini".}

\item{sample.fraction}{Numeric in (0, 1]; subsampling fraction per tree in ranger. Default 1.}

\item{min.node.size.grid}{Integer vector; candidate values for ranger’s \code{min.node.size}
used to build the tuning grid. Default c(1, 5, 10).}

\item{min_node_frac}{Numeric in (0, 1]. Safety cap for ranger’s min.node.size
per fold/final fit: the value used is min(requested_min.node.size,
floor(min_node_frac * n_train)), with a lower bound of 1.
This prevents root-only trees (near-uniform class probabilities) on small
training folds (e.g., with SMOTE). Applied inside CV and for the final model.
Default: 1/3 (set to 1 to disable capping).}
}
\value{
A list with:
\itemize{
\item train_mod: list with fields
\itemize{
\item model: the fitted ranger::ranger object (final model on full data)
\item method: "ranger"
\item best_params: data.frame with the best hyperparameters found by CV
\item cv_score: best mean CV score (according to \code{Metric})
\item metric: the metric name used
}
\item boxplot: ggplot object showing the distribution of per-fold metric values
\item Confusion.Matrix: caret::confusionMatrix for predictions of the final model on the full data
\item stats_global: data.frame with columns Metric, Mean, Sd summarizing per-fold metrics
\item resample: data.frame of per-fold metrics (columns: variable, value, fold)
}
}
\description{
Trains a multiclass random-forest classifier using the ranger implementation with a
compact hyperparameter search and repeated stratified cross-validation. Feature
columns are first subset by the provided m/z list (moz). During cross-validation,
class balancing (no/up/down/SMOTE) is applied only to the training portion of each
fold to avoid leakage; it is applied again to the full data before fitting the
final model. The evaluation across
folds can be parallelized in a Windows-safe manner (PSOCK), while avoiding
CPU oversubscription by giving each fold worker one ranger thread. Returns the
final ranger model, per-fold metrics, a confusion matrix on the full data, and
a ggplot boxplot of resampling metrics.
}
\details{
\itemize{
\item Feature subsetting: X is subset to columns whose numeric names match \code{moz}. This avoids
expensive joins/transposes and guarantees consistent feature order.
\item Cross-validation: folds are stratified by Y and repeated \code{repeats} times. Within each fold,
sampling is applied only to the training indices (to prevent leakage); it is applied once more to the
full data before the final fit.
\item Hyperparameter search: a compact grid over mtry (around sqrt(p)) and min.node.size
(from \code{min.node.size.grid}), optionally downsampled to \code{tuneLength}. The best combination
maximizes the chosen metric averaged over folds.
\item Parallel strategy: by default ("auto"), the code parallelizes across folds with a PSOCK cluster
(Windows-safe) and sets ranger’s num.threads = 1 inside each worker to avoid oversubscription.
If you set folds_parallel = "FALSE", folds run sequentially and each ranger fit uses up to
\code{ncores} threads for strong single-fit parallelism.
\item Metrics:
\itemize{
\item Accuracy and Cohen's Kappa are computed from the confusion matrix.
\item F1 is macro-averaged across classes.
\item AdjRankIndex is the Adjusted Rand Index, computed with \code{mclust::adjustedRandIndex}.
\item MatthewsCorrelation is the multiclass MCC.
}
}
}
\examples{
\dontrun{
set.seed(1)
X <- matrix(runif(3000), nrow = 100, ncol = 30)
colnames(X) <- as.character(round(seq(1000, 1290, length.out = 30), 4))
moz <- as.numeric(colnames(X))[seq(1, 30, by = 2)]  # keep half the m/z
Y <- factor(sample(letters[1:3], 100, replace = TRUE))

fit <- LogReg_rf_fast(
  X, moz, Y,
  number = 3, repeats = 1,
  Metric = "Kappa",
  Sampling = "no",
  ncores = 4,
  num.trees = 300,
  tuneLength = 4,
  seed = 42
)
fit$train_mod$best_params
fit$Confusion.Matrix
}

}
\seealso{
\code{\link[ranger:ranger]{ranger::ranger}}, \code{\link[caret:confusionMatrix]{caret::confusionMatrix}}
}
