#' Calculate Perplexity for Different Topic Numbers
#'
#' Fits one LDA model (VEM) for every candidate number of topics and reports
#' the perplexity of each fit, evaluated on the data the model was trained on,
#' to help determine the optimal number of topics.
#'
#' Documents whose row in the document-term matrix sums to zero (students who
#' wrote nothing or only stopwords) are dropped before fitting, mirroring
#' [ttm_lda()]: they carry no information about the topics and
#' `topicmodels::LDA()` refuses a matrix that contains all-zero rows.
#'
#' @param text_vector A character vector of aggregated student responses (length N).
#' @param k_range A numeric vector indicating the number of topics to try (e.g., 2:10).
#' @param seed Integer seed for reproducibility.
#'
#' @return A data frame with one row per element of `k_range`, containing K
#'   (`k`) and the corresponding perplexity score (`perplexity`).
#' @importFrom topicmodels LDA perplexity
#' @export
ttm_perplexity <- function(text_vector, k_range = 2:5, seed = 1234) {
  dtm <- clean_and_dtm(text_vector)

  # Same emptiness test as ttm_lda(): a document is usable only if at least
  # one term survived cleaning.
  row_totals <- apply(dtm, 1, sum)
  valid_indices <- which(row_totals > 0)
  n_empty <- length(row_totals) - length(valid_indices)

  if (length(valid_indices) == 0) {
    stop(
      "No non-empty documents left after cleaning; cannot fit LDA.",
      call. = FALSE
    )
  }

  # Only subset when needed so the fully-populated case is fitted on exactly
  # the matrix returned by clean_and_dtm().
  if (n_empty > 0) {
    message(
      "Note: ", n_empty,
      " empty/stopword-only responses were excluded from the perplexity fits."
    )
    dtm <- dtm[valid_indices, ]
  }

  message("Calculating perplexity...")
  # vapply() builds the whole perplexity vector in one pass (no rbind() growth
  # inside a loop) and guarantees one numeric score per candidate k.
  perplexities <- vapply(
    k_range,
    function(k) {
      message("  Fitting LDA with k = ", k)
      # Using VEM method for speed/consistency with paper context
      model <- topicmodels::LDA(
        dtm,
        k = k,
        method = "VEM",
        control = list(seed = seed)
      )
      topicmodels::perplexity(model)
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  # as.vector() strips any names from k_range so row names stay 1..n.
  data.frame(k = as.vector(k_range), perplexity = perplexities)
}

#' Fit LDA and Extract Topic Proportions
#'
#' Fits a Latent Dirichlet Allocation model (VEM) to the text and returns the
#' person-specific topic proportion matrix (delta).
#'
#' Responses that are empty after cleaning (nothing written, or only
#' stopwords) cannot be passed to the model. They are left out of the fit and
#' receive a uniform distribution (`1 / k` for every topic) in the result, so
#' the returned matrix still has one row per element of `text_vector`, in the
#' original order. If no response has any usable term, the function stops with
#' an error because there is nothing to fit.
#'
#' @param text_vector A character vector of aggregated student responses.
#' @param k The number of latent topics.
#' @param seed Integer seed for reproducibility.
#'
#' @return A matrix of dimension N x K containing topic proportions (delta).
#' @importFrom topicmodels LDA posterior
#' @export
ttm_lda <- function(text_vector, k, seed = 1234) {
  dtm <- clean_and_dtm(text_vector)

  # A document is usable only if at least one term survived cleaning.
  row_totals <- apply(dtm, 1, sum)
  valid_indices <- which(row_totals > 0)
  empty_indices <- which(row_totals == 0)

  # Fail early with a readable error instead of handing LDA a zero-row matrix.
  if (length(valid_indices) == 0) {
    stop(
      "No non-empty responses left after cleaning; cannot fit LDA.",
      call. = FALSE
    )
  }

  if (length(empty_indices) > 0) {
    message(
      "Warning: ", length(empty_indices),
      " students had empty/stopword-only responses. ",
      "They will be assigned uniform topic probabilities."
    )
  }

  # Fit on the non-empty documents only.
  dtm_valid <- dtm[valid_indices, ]

  message("Fitting LDA with k = ", k)
  lda_model <- topicmodels::LDA(
    dtm_valid,
    k = k,
    method = "VEM",
    control = list(seed = seed)
  )

  # Per-document topic proportions for the documents that were fitted.
  valid_delta <- topicmodels::posterior(lda_model)$topics

  # Rebuild the full N x K matrix so rows line up with the original students.
  full_delta <- matrix(0, nrow = length(text_vector), ncol = k)
  full_delta[valid_indices, ] <- valid_delta

  # Empty students get the uniform distribution (1/K): with no text there is
  # no evidence favouring any topic. Assigning to zero rows is a no-op.
  full_delta[empty_indices, ] <- 1 / k

  full_delta
}