#' @title Statistical Utilities for SVG Detection
#'
#' @description
#' Statistical utility functions used by SVG detection methods, including
#' Moran's I calculation, p-value computation, and expression binarization.
#'
#' @return See individual function documentation for return values.
#' @name utils_stats
NULL


#' Calculate Moran's I Statistic
#'
#' @description
#' Computes Moran's I spatial autocorrelation statistic for a numeric vector
#' given a spatial weights matrix.
#'
#' @param x Numeric vector of values (e.g., gene expression).
#' @param W Square numeric matrix of spatial weights.
#'   Must have the same dimension as length(x).
#' @param standardize Logical. If TRUE (default), row-standardize the weights matrix.
#'
#' @return A list containing:
#'   \itemize{
#'     \item \code{observed}: The observed Moran's I statistic
#'     \item \code{expected}: Expected value under null hypothesis of no spatial
#'       autocorrelation (typically -1/(n-1))
#'     \item \code{sd}: Standard deviation under null hypothesis
#'   }
#'
#' @details
#' Moran's I is defined as:
#' \deqn{I = \frac{n}{W} \frac{\sum_i \sum_j w_{ij}(x_i - \bar{x})(x_j - \bar{x})}{\sum_i (x_i - \bar{x})^2}}
#'
#' where n is the number of observations, W is the sum of all weights, and
#' w_ij is the weight between locations i and j.
#'
#' Under the null hypothesis of no spatial autocorrelation:
#' \itemize{
#'   \item Expected value: E[I] = -1/(n-1)
#'   \item Variance is computed using the analytical formula from Cliff and Ord (1981)
#' }
#'
#' @examples
#' # Create example data
#' set.seed(42)
#' x <- rnorm(100)
#' coords <- cbind(runif(100), runif(100))
#'
#' \donttest{
#' # Calculate Moran's I (requires RANN package)
#' if (requireNamespace("RANN", quietly = TRUE)) {
#'     W <- buildSpatialNetwork(coords, method = "knn", k = 6)
#'     result <- moranI(x, W)
#'     print(result)
#' }
#' }
#'
#' @references
#' Cliff, A.D. and Ord, J.K. (1981) Spatial Processes: Models & Applications. Pion.
#'
#' @export
moranI <- function(x, W, standardize = TRUE) {

    # Validate inputs
    if (!is.numeric(x)) {
        stop("x must be a numeric vector")
    }

    if (!is.matrix(W)) {
        W <- as.matrix(W)
    }

    N <- length(x)

    if (nrow(W) != N || ncol(W) != N) {
        stop("W must be a square matrix with dimension equal to length(x)")
    }

    # Row standardize weights if requested
    if (standardize) {
        rs <- rowSums(W)
        rs[rs == 0] <- 1
        W <- W / rs
    }

    # Calculate Moran's I
    W_sum <- sum(W)
    z <- x - mean(x)

    # Numerator: spatial covariance
    cv <- sum(W * outer(z, z))

    # Denominator: total variance
    v <- sum(z^2)

    # Moran's I
    I_obs <- (N / W_sum) * (cv / v)

    # Expected value under null
    E_I <- -1 / (N - 1)

    # Variance calculation (analytical formula)
    S1 <- 0.5 * sum((W + t(W))^2)
    S2 <- sum((rowSums(W) + colSums(W))^2)

    S3 <- (sum(z^4) / N) / (v / N)^2

    S4 <- (N^2 - 3 * N + 3) * S1 - N * S2 + 3 * W_sum^2
    S5 <- (N^2 - N) * S1 - 2 * N * S2 + 6 * W_sum^2

    E_I2 <- (N * S4 - S3 * S5) / ((N - 1) * (N - 2) * (N - 3) * W_sum^2)
    V_I <- E_I2 - E_I^2

    sd_I <- sqrt(max(V_I, 0))  # Ensure non-negative

    return(list(
        observed = I_obs,
        expected = E_I,
        sd = sd_I
    ))
}


#' Moran's I Test for Spatial Autocorrelation
#'
#' @description
#' Performs a statistical test for spatial autocorrelation using Moran's I.
#' Returns the test statistic, expected value, standard deviation, and p-value.
#'
#' @param x Numeric vector of values.
#' @param W Square numeric matrix of spatial weights.
#' @param alternative Character string specifying the alternative hypothesis.
#'   One of "greater" (default), "less", or "two.sided".
#'   \itemize{
#'     \item \code{"greater"}: Test for positive spatial autocorrelation
#'       (similar values cluster together)
#'     \item \code{"less"}: Test for negative spatial autocorrelation
#'       (dissimilar values are neighbors)
#'     \item \code{"two.sided"}: Test for any spatial autocorrelation
#'   }
#' @param standardize Logical. If TRUE (default), row-standardize weights.
#'
#' @return A named numeric vector with components:
#'   \itemize{
#'     \item \code{observed}: Observed Moran's I
#'     \item \code{expected}: Expected Moran's I under null
#'     \item \code{sd}: Standard deviation under null
#'     \item \code{p.value}: P-value from normal approximation
#'   }
#'
#' @examples
#' set.seed(42)
#' x <- rnorm(100)
#' coords <- cbind(runif(100), runif(100))
#'
#' \donttest{
#' # Test for spatial autocorrelation (requires RANN package)
#' if (requireNamespace("RANN", quietly = TRUE)) {
#'     W <- buildSpatialNetwork(coords, method = "knn", k = 6)
#'     result <- moranI_test(x, W)
#'     print(result)
#' }
#' }
#'
#' @export
moranI_test <- function(x,
                        W,
                        alternative = c("greater", "less", "two.sided"),
                        standardize = TRUE) {

    alternative <- match.arg(alternative)

    # Calculate Moran's I statistics
    moran_stats <- moranI(x, W, standardize = standardize)

    # Calculate p-value
    pv <- pnorm(moran_stats$observed,
                mean = moran_stats$expected,
                sd = moran_stats$sd)

    if (alternative == "greater") {
        pv <- 1 - pv
    } else if (alternative == "two.sided") {
        if (moran_stats$observed <= moran_stats$expected) {
            pv <- 2 * pv
        } else {
            pv <- 2 * (1 - pv)
        }
    }
    # For "less", pv is already correct

    return(c(
        observed = moran_stats$observed,
        expected = moran_stats$expected,
        sd = moran_stats$sd,
        p.value = pv
    ))
}


#' ACAT: Aggregated Cauchy Association Test
#'
#' @description
#' Combines multiple p-values using the Aggregated Cauchy Association Test (ACAT).
#' This method is robust and maintains correct type I error even with correlated
#' p-values.
#'
#' @param pvals Numeric vector of p-values to combine.
#' @param weights Numeric vector of weights. If NULL (default), equal weights are used.
#'
#' @return A single combined p-value.
#'
#' @details
#' ACAT transforms p-values using the Cauchy distribution and combines them:
#' \deqn{T = \sum_i w_i \tan(\pi(0.5 - p_i))}
#'
#' The combined p-value is then computed from the Cauchy distribution.
#'
#' This method has several advantages:
#' \itemize{
#'   \item Valid even when p-values are correlated
#'   \item Computationally simple
#'   \item Handles edge cases (p = 0 or 1) gracefully
#' }
#'
#' @references
#' Liu, Y. et al. (2019) ACAT: A Fast and Powerful P Value Combination Method
#' for Rare-Variant Analysis in Sequencing Studies. The American Journal of
#' Human Genetics.
#'
#' @examples
#' # Combine independent p-values
#' pvals <- c(0.01, 0.05, 0.3)
#' combined_p <- ACAT_combine(pvals)
#' print(combined_p)
#'
#' @export
ACAT_combine <- function(pvals, weights = NULL) {

    # Input validation
    if (any(is.na(pvals))) {
        stop("P-values cannot contain NA values")
    }

    if (any(pvals < 0) || any(pvals > 1)) {
        stop("P-values must be between 0 and 1")
    }

    # Handle edge cases
    if (any(pvals == 0) && any(pvals == 1)) {
        stop("Cannot have both 0 and 1 p-values")
    }

    if (any(pvals == 0)) {
        return(0)
    }

    if (any(pvals == 1)) {
        warning("P-value of exactly 1 detected")
        return(1)
    }

    # Set up weights
    if (is.null(weights)) {
        weights <- rep(1 / length(pvals), length(pvals))
    } else {
        if (length(weights) != length(pvals)) {
            stop("Length of weights must equal length of pvals")
        }
        if (any(weights < 0)) {
            stop("All weights must be non-negative")
        }
        weights <- weights / sum(weights)
    }

    # Calculate ACAT statistic
    # Handle very small p-values separately for numerical stability
    is_small <- (pvals < 1e-16)

    if (sum(is_small) == 0) {
        # Standard calculation
        T_stat <- sum(weights * tan((0.5 - pvals) * pi))
    } else {
        # For very small p-values: tan(pi/2 - pi*p) ≈ 1/(pi*p)
        T_stat <- sum((weights[is_small] / pvals[is_small]) / pi)
        T_stat <- T_stat + sum(weights[!is_small] * tan((0.5 - pvals[!is_small]) * pi))
    }

    # Calculate combined p-value from Cauchy distribution
    if (T_stat > 1e15) {
        # For very large T, use approximation
        combined_p <- (1 / T_stat) / pi
    } else {
        combined_p <- 1 - pcauchy(T_stat)
    }

    return(combined_p)
}


#' Liu's Method for Approximating P-values
#'
#' @description
#' Approximates p-values for quadratic forms in normal variables using
#' Liu's moment-matching method.
#'
#' @param q Test statistic (quadratic form value).
#' @param lambda Vector of eigenvalues.
#'
#' @return Approximate p-value.
#'
#' @details
#' This method matches the first four moments of the quadratic form to a
#' chi-square distribution with appropriate degrees of freedom and non-centrality.
#'
#' @references
#' Liu, H., Tang, Y., and Zhang, H.H. (2009). A new chi-square approximation
#' to the distribution of non-negative definite quadratic forms in non-central
#' normal variables. Computational Statistics & Data Analysis.
#'
#' @keywords internal
liu_pvalue <- function(q, lambda) {

    # Input validation
    if (length(lambda) == 0 || all(lambda <= 0)) {
        return(NA)
    }

    # Calculate cumulants
    c1 <- sum(lambda)
    c2 <- sum(lambda^2)
    c3 <- sum(lambda^3)
    c4 <- sum(lambda^4)

    # Handle edge cases
    if (c2 < 1e-10) return(NA)
    if (c3 == 0) return(NA)

    # Skewness and kurtosis
    s1 <- c3 / (c2^1.5)
    s2 <- c4 / (c2^2)

    # Determine parameters
    if (s1^2 > s2) {
        denom <- s1 - sqrt(s1^2 - s2)
        if (abs(denom) < 1e-10) return(NA)
        a <- 1 / denom
        delta <- s1 * a^3 - a^2
        l <- a^2 - 2 * delta
    } else {
        if (abs(s1) < 1e-10) return(NA)
        a <- 1 / s1
        delta <- 0
        l <- c2^3 / c3^2
    }

    # Check valid parameters
    if (l <= 0 || !is.finite(l)) return(NA)
    if (delta < 0) delta <- 0  # ncp must be non-negative

    # Standardize and compute p-value
    mu_Q <- c1
    sigma_Q <- sqrt(2 * c2)

    if (sigma_Q < 1e-10) return(NA)

    q_star <- (q - mu_Q) / sigma_Q * sqrt(2 * l) + l
    if (!is.finite(q_star)) return(NA)

    pval <- pchisq(q_star, df = l, ncp = delta, lower.tail = FALSE)

    # Ensure p-value is in valid range
    pval <- max(0, min(1, pval))

    return(pval)
}


#' Davies' Method for Quadratic Form P-values
#'
#' @description
#' Computes exact p-values for quadratic forms using Davies' algorithm.
#' Falls back to Liu's method if Davies fails.
#'
#' @param q Test statistic.
#' @param lambda Vector of eigenvalues.
#'
#' @return P-value.
#'
#' @details
#' This function wraps CompQuadForm::davies() with fallback to Liu's method.
#' Davies' method is more accurate but can fail for extreme parameter values.
#'
#' @keywords internal
davies_pvalue <- function(q, lambda) {

    # Try using CompQuadForm if available
    if (requireNamespace("CompQuadForm", quietly = TRUE)) {
        result <- try(CompQuadForm::davies(q, lambda), silent = TRUE)

        if (!inherits(result, "try-error") && result$Qq > 0) {
            return(result$Qq)
        }
    }

    # Fallback to Liu's method
    return(liu_pvalue(q, lambda))
}


#' Binarize Gene Expression
#'
#' @description
#' Converts continuous gene expression values to binary (0/1) using various methods.
#' Used by the binSpect method.
#'
#' @param expr_matrix Numeric matrix of gene expression.
#'   Rows are genes, columns are spots/cells.
#' @param method Character string specifying binarization method.
#'   \itemize{
#'     \item \code{"kmeans"} (default): Use k-means clustering (k=2) to separate
#'       high and low expression
#'     \item \code{"rank"}: Binarize based on expression rank percentile
#'     \item \code{"median"}: Values above median are set to 1
#'     \item \code{"mean"}: Values above mean are set to 1
#'   }
#' @param rank_percent Numeric. For \code{method = "rank"}, the percentile
#'   threshold (0-100). Values in the top \code{rank_percent} percent are set to 1.
#'   Default is 30.
#' @param n_threads Integer. Number of threads for parallel computation.
#'   Default is 1.
#' @param verbose Logical. Whether to print progress. Default is FALSE.
#'
#' @return Binary matrix with same dimensions as input.
#'
#' @details
#' \strong{K-means method:}
#' For each gene, k-means clustering with k=2 is applied. The cluster with
#' higher mean expression is labeled as 1, the other as 0.
#'
#' \strong{Rank method:}
#' For each gene, spots are ranked by expression. The top \code{rank_percent}
#' percent are labeled as 1.
#'
#' @examples
#' # Create example expression matrix
#' expr <- matrix(rpois(1000, lambda = 10), nrow = 10, ncol = 100)
#' rownames(expr) <- paste0("gene_", 1:10)
#'
#' # Binarize using k-means
#' bin_kmeans <- binarize_expression(expr, method = "kmeans")
#'
#' # Binarize using rank (top 20%)
#' bin_rank <- binarize_expression(expr, method = "rank", rank_percent = 20)
#'
#' @export
binarize_expression <- function(expr_matrix,
                                 method = c("kmeans", "rank", "median", "mean"),
                                 rank_percent = 30,
                                 n_threads = 1L,
                                 verbose = FALSE) {

    method <- match.arg(method)

    if (!is.matrix(expr_matrix)) {
        expr_matrix <- as.matrix(expr_matrix)
    }

    n_genes <- nrow(expr_matrix)

    if (verbose) {
        message(sprintf("Binarizing %d genes using %s method...", n_genes, method))
    }

    # Define binarization function based on method
    binarize_func <- switch(method,
        "kmeans" = function(x) {
            if (sd(x) < 1e-10) {
                return(rep(0, length(x)))
            }
            km <- kmeans(x, centers = 2, nstart = 3, iter.max = 20)
            cluster_means <- tapply(x, km$cluster, mean)
            high_cluster <- which.max(cluster_means)
            as.integer(km$cluster == high_cluster)
        },
        "rank" = function(x) {
            n <- length(x)
            # Top rank_percent% should be "high" (1)
            threshold <- ceiling(n * rank_percent / 100)
            # rank(-x) gives rank 1 to highest value
            ranks <- rank(-x, ties.method = "first")
            as.integer(ranks <= threshold)
        },
        "median" = function(x) {
            as.integer(x > median(x))
        },
        "mean" = function(x) {
            as.integer(x > mean(x))
        }
    )

    # Apply binarization
    if (n_threads > 1 && requireNamespace("parallel", quietly = TRUE)) {
        bin_matrix <- do.call(rbind, parallel::mclapply(
            seq_len(n_genes),
            function(i) binarize_func(expr_matrix[i, ]),
            mc.cores = n_threads
        ))
    } else {
        bin_matrix <- t(apply(expr_matrix, 1, binarize_func))
    }

    # Restore row/column names
    rownames(bin_matrix) <- rownames(expr_matrix)
    colnames(bin_matrix) <- colnames(expr_matrix)

    return(bin_matrix)
}
