#' Pre-whiten count series with GLM / NegBin model
#'
#' Fits a generalized linear model for count data using either a
#' negative binomial model with log link and offset, or a Poisson
#' fallback, and returns Pearson residuals to be used as a
#' pre-whitened series.
#'
#' @param DT A \code{data.frame} or \code{data.table} containing the
#'   response and covariates. It must include at least:
#'   \itemize{
#'     \item The count variable named by \code{yname}.
#'     \item \code{t_norm}: normalized time index.
#'     \item \code{Regime}, \code{EconCycle}, \code{PopDensity},
#'           \code{Epidemics}, \code{Climate}, \code{War}.
#'     \item \code{log_exposure50}: log exposure (offset).
#'   }
#' @param yname Character scalar; name of the count response column in
#'   \code{DT}.
#'
#' @details
#' The function first attempts to fit a negative binomial GLM
#' via \code{MASS::glm.nb()} with a log link and \code{log_exposure50}
#' as an offset. If the fit fails (e.g., due to convergence issues),
#' it falls back to a Poisson GLM via \code{glm(family = poisson())}
#' with the same formula and offset.
#'
#' @return A numeric vector of Pearson residuals (one per row in \code{DT}
#'   used in the fit).
#'
#' @examples
#' \donttest{
#' if (interactive()) {
#'   n <- 100
#'   DT <- data.frame(
#'     t_norm = seq_len(n) / n,
#'     I = rpois(n, 5),
#'     Regime = factor(sample(c("A","B"), n, TRUE)),
#'     EconCycle = rnorm(n), PopDensity = runif(n),
#'     Epidemics = rbinom(n, 1, 0.1), Climate = rnorm(n), War = rbinom(n, 1, 0.05),
#'     log_exposure50 = log(runif(n, 40, 60))
#'   )
#'   r_I <- prewhiten_count_glm(DT, "I")
#'   head(r_I)
#' }
#' }
#' @export

prewhiten_count_glm <- function(DT, yname) {
  fm <- as.formula(paste0(
    yname, " ~ t_norm + I(t_norm^2) + Regime + ",
    "EconCycle + PopDensity + Epidemics + Climate + War"
  ))
  off <- DT$log_exposure50
  fit <- try(MASS::glm.nb(fm, data = DT, offset = off), silent = TRUE)
  if (inherits(fit, "try-error")) fit <- glm(fm, data = DT, offset = off, family = poisson())
  resid(fit, type = "pearson")
}

#' Pre-whiten rate series with log-link Gaussian GLM
#'
#' Fits a Gaussian GLM with log link to a rate variable (count/exposure)
#' without offset, applying a small lower bound to avoid zeros, and
#' returns Pearson residuals as a pre-whitened series.
#'
#' @param DT A \code{data.frame} or \code{data.table} containing the
#'   rate variable and covariates. It must include at least:
#'   \itemize{
#'     \item The rate variable named by \code{yname}.
#'     \item \code{t_norm}: normalized time index.
#'     \item \code{Regime}, \code{EconCycle}, \code{PopDensity},
#'           \code{Epidemics}, \code{Climate}, \code{War}.
#'   }
#' @param yname Character scalar; name of the rate response column in
#'   \code{DT}.
#'
#' @details
#' The response \code{y} is first sanitized via
#' \code{y_safe <- pmax(y, 1e-8)} to avoid taking logs of zero. The
#' model is then fit with \code{glm(family = gaussian(link = "log"))}.
#'
#' @return A numeric vector of Pearson residuals (one per row in \code{DT}
#'   used in the fit).
#'
#' @examples
#' \donttest{
#' if (interactive()) {
#'   n <- 100
#'   DT <- data.frame(
#'     t_norm = seq_len(n) / n,
#'     I_rate = rgamma(n, 2, 1),
#'     Regime = factor(sample(c("A","B"), n, TRUE)),
#'     EconCycle = rnorm(n), PopDensity = runif(n),
#'     Epidemics = rbinom(n, 1, 0.1), Climate = rnorm(n), War = rbinom(n, 1, 0.05)
#'   )
#'   r_I_rate <- prewhiten_rate_glm(DT, "I_rate")
#'   head(r_I_rate)
#' }
#' }
#' @export

prewhiten_rate_glm <- function(DT, yname) {
  fm <- as.formula(paste0(
    yname, " ~ t_norm + I(t_norm^2) + Regime + ",
    "EconCycle + PopDensity + Epidemics + Climate + War"
  ))
  y <- DT[[yname]]
  y_safe <- pmax(y, 1e-8)
  DTx <- data.table::copy(DT); DTx[[yname]] <- y_safe
  fit <- glm(fm, data = DTx, family = gaussian(link = "log"))
  resid(fit, type = "pearson")
}

#' Pre-whiten binary series with logistic GLM
#'
#' Fits a logistic regression (binomial GLM with logit link) to a binary
#' 0/1 response and returns Pearson residuals as a pre-whitened series.
#'
#' @param DT A \code{data.frame} or \code{data.table} containing the
#'   binary response and covariates. It must include at least:
#'   \itemize{
#'     \item The binary variable named by \code{yname} (values 0/1).
#'     \item \code{t_norm}: normalized time index.
#'     \item \code{Regime}, \code{EconCycle}, \code{PopDensity},
#'           \code{Epidemics}, \code{Climate}, \code{War}.
#'   }
#' @param yname Character scalar; name of the binary response column in
#'   \code{DT}. The function checks that all values are in \code{c(0, 1)}
#'   and stops otherwise.
#'
#' @return A numeric vector of Pearson residuals (one per row in \code{DT}
#'   used in the fit).
#'
#' @examples
#' \donttest{
#' if (interactive()) {
#'   n <- 100
#'   DT <- data.frame(
#'     t_norm = seq_len(n) / n,
#'     I_zero = rbinom(n, 1, 0.3),
#'     Regime = factor(sample(c("A","B"), n, TRUE)),
#'     EconCycle = rnorm(n), PopDensity = runif(n),
#'     Epidemics = rbinom(n, 1, 0.1), Climate = rnorm(n), War = rbinom(n, 1, 0.05)
#'   )
#'   r_I_zero <- prewhiten_bin_glm(DT, "I_zero")
#'   head(r_I_zero)
#' }
#' }
#' @export

prewhiten_bin_glm <- function(DT, yname) {
  fm <- as.formula(paste0(
    yname, " ~ t_norm + I(t_norm^2) + Regime + ",
    "EconCycle + PopDensity + Epidemics + Climate + War"
  ))
  y <- DT[[yname]]
  if (!all(y %in% c(0,1))) stop("prewhiten_bin_glm: la variable no es binaria 0/1.")
  fit <- glm(fm, data = DT, family = binomial())
  resid(fit, type = "pearson")
}

#' Coerce to numeric and return first element
#'
#' Helper to safely coerce an object to numeric and return the first
#' element, or \code{NA_real_} if empty. Used internally when parsing
#' RTransferEntropy-style output tables.
#'
#' @param z An object to be coerced to numeric.
#'
#' @return A numeric scalar (first element of \code{as.numeric(z)}) or
#'   \code{NA_real_} if conversion fails or the result is empty.
#'
#' @keywords internal

.as_num1 <- function(z){ z <- suppressWarnings(as.numeric(z)); if (length(z)) z[1] else NA_real_ }

#' Safely extract coefficient matrix from an object
#'
#' Helper to call \code{coef()} on an object and return the result as a
#' matrix, or \code{NULL} if \code{coef()} errors or does not return a
#' matrix. Intended for objects produced by RTransferEntropy.
#'
#' @param obj An object with a \code{coef()} method.
#'
#' @return A numeric matrix of coefficients, or \code{NULL} on failure.
#'
#' @keywords internal

.get_coef <- function(obj){ m <- try(coef(obj), silent=TRUE); if (inherits(m,"try-error") || !is.matrix(m)) NULL else m }

#' Extract TE statistic from RTransferEntropy result
#'
#' Helper to extract a single transfer-entropy-like statistic from a
#' coefficient table. It looks for columns named \code{"Eff. TE"} or
#' \code{"TE"} (in that order) and falls back to the first column if
#' neither is present.
#'
#' @param obj An object produced by RTransferEntropy (or similar) for
#'   which \code{coef(obj)} returns a matrix.
#'
#' @return A numeric scalar with the extracted statistic (first row of
#'   the chosen column), or \code{NA_real_} if extraction fails.
#'
#' @keywords internal

.get_stat <- function(obj){
  m <- .get_coef(obj); if (is.null(m)) return(NA_real_)
  col <- if ("Eff. TE" %in% colnames(m)) "Eff. TE" else if ("TE" %in% colnames(m)) "TE" else colnames(m)[1]
  .as_num1(m[1, col])
}

#' Extract p-value from RTransferEntropy result
#'
#' Helper to extract a p-value from a coefficient table returned by
#' RTransferEntropy or similar packages. It searches for a column whose
#' name matches \code{"^p[._ -]?value$"} (case-insensitive) and returns
#' the first-row entry of that column.
#'
#' @param obj An object for which \code{coef(obj)} returns a matrix
#'   containing a p-value column.
#'
#' @return A numeric scalar with the extracted p-value, or
#'   \code{NA_real_} if no suitable column is found or extraction fails.
#'
#' @keywords internal

.get_pval <- function(obj){
  m <- .get_coef(obj); if (is.null(m)) return(NA_real_)
  pvcol <- grep("^p[._ -]?value$", colnames(m), ignore.case=TRUE, value=TRUE)
  if (length(pvcol)) return(.as_num1(m[1, pvcol[1]]))
  NA_real_
}

#' Transfer Entropy for Counts, Rates, and Binary Series
#'
#' Computes pairwise transfer entropy between \code{I} and \code{C} for
#' three transformations of the data: raw counts, rates (count/exposure),
#' and binary presence/absence. Each series is first pre-whitened via a
#' GLM and transfer entropy is then estimated for a grid of lags using
#' \pkg{RTransferEntropy}. Results are written to separate CSV files and
#' to a combined summary.
#'
#' @param DT A \code{data.table} or \code{data.frame} containing at least
#'   the following columns:
#'   \itemize{
#'     \item \code{I}, \code{C}: count variables (non-negative integers).
#'     \item \code{exposure50}: exposure used to form rates (must be
#'       strictly positive).
#'     \item \code{log_exposure50}: log of the exposure (offset).
#'     \item \code{t_norm}, \code{Regime}, \code{EconCycle},
#'           \code{PopDensity}, \code{Epidemics}, \code{Climate}, \code{War}:
#'           covariates used by the pre-whitening GLMs.
#'   }
#' @param lags Integer vector of lag orders \code{L} for which transfer
#'   entropy is computed (passed to \code{lx} and \code{ly} in
#'   \code{RTransferEntropy::transfer_entropy()}).
#' @param shuffles Integer; number of shuffle replications for the
#'   surrogate-distribution-based significance test in
#'   \code{transfer_entropy()}.
#' @param seed Integer; base random seed used for reproducibility of the
#'   pre-whitening and transfer entropy computations.
#' @param use_progress Logical; reserved for future use to toggle progress
#'   reporting. Currently not used.
#'
#' @details
#' The function proceeds in four steps:
#' \enumerate{
#'   \item \strong{Counts}: \code{I} and \code{C} are pre-whitened via
#'     \code{\link{prewhiten_count_glm}} (Negative Binomial with offset and
#'     Poisson fallback). Transfer entropy is computed in both directions
#'     (\code{I→C} and \code{C→I}) for each lag in \code{lags}. Results are
#'     saved to \code{"transfer_entropy_counts.csv"}.
#'   \item \strong{Rates}: \code{I} and \code{C} are divided by
#'     \code{exposure50}, pre-whitened via
#'     \code{\link{prewhiten_rate_glm}}, and transfer entropy is recomputed.
#'     Results are saved to \code{"transfer_entropy_rates.csv"}. A check is
#'     performed to ensure \code{exposure50 > 0} for all observations.
#'   \item \strong{Binary}: \code{I} and \code{C} are recoded as 0/1
#'     presence/absence indicators and pre-whitened via
#'     \code{\link{prewhiten_bin_glm}}. Transfer entropy is computed again
#'     and results are saved to \code{"transfer_entropy_binary.csv"}.
#'   \item \strong{Combined}: All tables are stacked into a single data
#'     frame with a \code{type} column (\code{"counts"}, \code{"rates"},
#'     \code{"binary"}) and written to \code{"transfer_entropy.csv"}.
#' }
#'
#' Internally, the helpers \code{\link{.get_stat}} and
#' \code{\link{.get_pval}} are used to extract the transfer entropy
#' statistic and p-value from the objects returned by
#' \code{RTransferEntropy::transfer_entropy()}. The function assumes a
#' global \code{dir_csv} object (character scalar) indicating the output
#' directory for CSV files.
#'
#' @return A \code{data.frame} with one row per lag and type, and columns:
#' \itemize{
#'   \item \code{lag}: lag order used in \code{transfer_entropy()}.
#'   \item \code{TE_ItoC}, \code{p_ItoC}: transfer entropy and p-value
#'     from \code{I} to \code{C}.
#'   \item \code{TE_CtoI}, \code{p_CtoI}: transfer entropy and p-value
#'     from \code{C} to \code{I}.
#'   \item \code{type}: transformation used (\code{"counts"},
#'     \code{"rates"}, or \code{"binary"}).
#' }
#'
#' @examples
#' \donttest{
#' library(data.table)
#'
#' # 1. Create dummy data with ALL covariates required by prewhiten_*_glm()
#' # The internal GLM formulas likely include:
#' # I ~ t_norm + Regime + EconCycle + PopDensity + Epidemics + Climate + War
#' DT <- data.table(
#'   year = 2000:2029,
#'   I = rpois(30, lambda = 10),
#'   C = rpois(30, lambda = 8),
#'   exposure50 = runif(30, 100, 200),
#'   log_exposure50 = log(runif(30, 100, 200)),
#'   # Covariates
#'   t_norm = seq(-1, 1, length.out = 30),
#'   Regime = factor(sample(c("A", "B"), 30, replace = TRUE)),
#'   EconCycle = rnorm(30),
#'   PopDensity = rnorm(30),
#'   Epidemics = rnorm(30),
#'   Climate = rnorm(30),
#'   War = rnorm(30)
#' )
#'
#' # 2. Define global paths using tempdir() (Fixes CRAN policy)
#' # run_transfer_entropy writes output to 'dir_csv'
#' tmp_dir <- tempdir()
#' dir_csv <- file.path(tmp_dir, "csv")
#' if (!dir.exists(dir_csv)) dir.create(dir_csv, recursive = TRUE)
#'
#' # 3. Run the function
#' # Using fewer shuffles for a faster example check
#' te_tab <- run_transfer_entropy(DT, lags = 1, shuffles = 10, seed = 123)
#'
#' # Inspect results
#' if (!is.null(te_tab)) {
#'   print(subset(te_tab, type == "counts"))
#' }
#' }
#'
#' @export

run_transfer_entropy <- function(DT, lags = 1:3, shuffles = 1000, seed = 123, use_progress = TRUE) {
  
  set.seed(seed)
  
  Iy_c <- prewhiten_count_glm(DT, "I")
  Cy_c <- prewhiten_count_glm(DT, "C")
  
  do_iter <- function(Iy, Cy, L, shuf, sd0){
    teIC <- try(RTransferEntropy::transfer_entropy(Iy, Cy, lx=L, ly=L, quiet=TRUE, shuffles=shuf, seed=sd0+L), silent=TRUE)
    teCI <- try(RTransferEntropy::transfer_entropy(Cy, Iy, lx=L, ly=L, quiet=TRUE, shuffles=shuf, seed=sd0+L), silent=TRUE)
    data.frame(lag=L, TE_ItoC=.get_stat(teIC), p_ItoC=.get_pval(teIC),
               TE_CtoI=.get_stat(teCI), p_CtoI=.get_pval(teCI))
  }
  
  tab_counts <- dplyr::bind_rows(lapply(lags, do_iter, Iy=Iy_c, Cy=Cy_c, shuf=shuffles, sd0=seed))
  tab_counts$type <- "counts"
  readr::write_csv(tab_counts, file.path(dir_csv, "transfer_entropy_counts.csv"))
  
  if (any(DT$exposure50 <= 0, na.rm = TRUE)) stop("exposure50 <= 0 detected; correct before forming rates.")
  DT_rate <- data.table::copy(DT)
  DT_rate[, `:=`(I = I / exposure50, C = C / exposure50)]
  Iy_r <- prewhiten_rate_glm(DT_rate, "I")
  Cy_r <- prewhiten_rate_glm(DT_rate, "C")
  tab_rates <- dplyr::bind_rows(lapply(lags, do_iter, Iy=Iy_r, Cy=Cy_r, shuf=shuffles, sd0=seed+1000))
  tab_rates$type <- "rates"
  readr::write_csv(tab_rates, file.path(dir_csv, "transfer_entropy_rates.csv"))
  
  DT_bin <- data.table::copy(DT)
  DT_bin[, `:=`(I = as.integer(I > 0), C = as.integer(C > 0))]
  Iy_b <- prewhiten_bin_glm(DT_bin, "I")
  Cy_b <- prewhiten_bin_glm(DT_bin, "C")
  tab_bin <- dplyr::bind_rows(lapply(lags, do_iter, Iy=Iy_b, Cy=Cy_b, shuf=shuffles, sd0=seed+2000))
  tab_bin$type <- "binary"
  readr::write_csv(tab_bin, file.path(dir_csv, "transfer_entropy_binary.csv"))
  
  tab_all <- dplyr::bind_rows(tab_counts, tab_rates, tab_bin)
  readr::write_csv(tab_all, file.path(dir_csv, "transfer_entropy.csv"))
  
  tab_all
}
