Getting Started with tidylearn

Introduction

tidylearn provides a unified tidyverse-compatible interface to R’s machine learning ecosystem. It wraps proven packages like glmnet, randomForest, xgboost, e1071, cluster, and dbscan — you get the reliability of established implementations with the convenience of a consistent, tidy API.

What tidylearn does: it provides a single, consistent, tidyverse-compatible interface (tl_model(), tl_split(), tl_prepare_data(), and friends) to established machine learning packages.

What tidylearn is NOT: a reimplementation of machine learning algorithms — all model fitting is delegated to the proven underlying packages, whose behavior is unchanged.

Installation

# Install from CRAN
install.packages("tidylearn")

# Or install development version from GitHub
# devtools::install_github("ces0491/tidylearn")
library(tidylearn)
library(dplyr)

The Unified Interface

The core of tidylearn is the tl_model() function, which dispatches to the appropriate underlying package based on the method you specify. The wrapped packages include stats, glmnet, randomForest, xgboost, gbm, e1071, nnet, rpart, cluster, and dbscan.

Supervised Learning

Classification

# Classification with logistic regression
model_logistic <- tl_model(iris, Species ~ ., method = "logistic")
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(model_logistic)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: logistic 
#> Task: Classification 
#> Formula: Species ~ . 
#> 
#> Training observations: 150
# Make predictions
predictions <- predict(model_logistic)
head(predictions)
#> # A tibble: 6 × 1
#>      .pred
#>      <dbl>
#> 1 2.22e-16
#> 2 2.22e-16
#> 3 2.22e-16
#> 4 1.88e-12
#> 5 2.22e-16
#> 6 2.22e-16

Regression

# Regression with linear model
model_linear <- tl_model(mtcars, mpg ~ wt + hp, method = "linear")
print(model_linear)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: linear 
#> Task: Regression 
#> Formula: mpg ~ wt + hp 
#> 
#> Training observations: 32
# Predictions
predictions_reg <- predict(model_linear)
head(predictions_reg)
#> # A tibble: 6 × 1
#>   .pred
#>   <dbl>
#> 1  23.6
#> 2  22.6
#> 3  25.3
#> 4  21.3
#> 5  18.3
#> 6  20.5

Unsupervised Learning

Dimensionality Reduction

# Principal Component Analysis
model_pca <- tl_model(iris[, 1:4], method = "pca")
print(model_pca)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: pca 
#> Technique: pca 
#> 
#> Training observations: 150
# Transform data
transformed <- predict(model_pca)
head(transformed)
#> # A tibble: 6 × 5
#>   .obs_id   PC1    PC2     PC3      PC4
#>   <chr>   <dbl>  <dbl>   <dbl>    <dbl>
#> 1 1       -2.26 -0.478  0.127   0.0241 
#> 2 2       -2.07  0.672  0.234   0.103  
#> 3 3       -2.36  0.341 -0.0441  0.0283 
#> 4 4       -2.29  0.595 -0.0910 -0.0657 
#> 5 5       -2.38 -0.645 -0.0157 -0.0358 
#> 6 6       -2.07 -1.48  -0.0269  0.00659

Clustering

# K-means clustering
model_kmeans <- tl_model(iris[, 1:4], method = "kmeans", k = 3)
print(model_kmeans)
#> tidylearn Model
#> ===============
#> Paradigm: unsupervised 
#> Method: kmeans 
#> Technique: kmeans 
#> 
#> Training observations: 150
# Get cluster assignments
clusters <- model_kmeans$fit$clusters
head(clusters)
#> # A tibble: 6 × 2
#>   .obs_id cluster
#>   <chr>     <int>
#> 1 1             3
#> 2 2             3
#> 3 3             3
#> 4 4             3
#> 5 5             3
#> 6 6             3
# Compare with actual species
table(clusters$cluster, iris$Species)
#>    
#>     setosa versicolor virginica
#>   1      0          2        36
#>   2      0         48        14
#>   3     50          0         0

Data Preprocessing

tidylearn provides comprehensive preprocessing functions:

# Prepare data with multiple preprocessing steps
processed <- tl_prepare_data(
  iris,
  Species ~ .,
  impute_method = "mean",
  scale_method = "standardize",
  encode_categorical = FALSE
)
#> Scaling numeric features using method: standardize
# Check preprocessing steps applied
names(processed$preprocessing_steps)
#> [1] "scaling"
# Use processed data for modeling
model_processed <- tl_model(processed$data, Species ~ ., method = "forest")

Train-Test Splitting

# Simple random split
split <- tl_split(iris, prop = 0.7, seed = 123)

# Train model
model_train <- tl_model(split$train, Species ~ ., method = "logistic")
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Test predictions
predictions_test <- predict(model_train, new_data = split$test)
head(predictions_test)
#> # A tibble: 6 × 1
#>      .pred
#>      <dbl>
#> 1 2.22e-16
#> 2 2.22e-16
#> 3 2.22e-16
#> 4 2.22e-16
#> 5 2.22e-16
#> 6 2.22e-16
# Stratified split (maintains class proportions)
split_strat <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)

# Check proportions are maintained
prop.table(table(split_strat$train$Species))
#> 
#>     setosa versicolor  virginica 
#>  0.3333333  0.3333333  0.3333333
prop.table(table(split_strat$test$Species))
#> 
#>     setosa versicolor  virginica 
#>  0.3333333  0.3333333  0.3333333
prop.table(table(iris$Species))
#> 
#>     setosa versicolor  virginica 
#>  0.3333333  0.3333333  0.3333333

Wrapped Packages

tidylearn provides a unified interface to these established R packages:

Supervised Methods

Method Underlying Package Function Called
"linear" stats lm()
"polynomial" stats lm() with poly()
"logistic" stats glm(..., family = binomial)
"ridge", "lasso", "elastic_net" glmnet glmnet()
"tree" rpart rpart()
"forest" randomForest randomForest()
"boost" gbm gbm()
"xgboost" xgboost xgb.train()
"svm" e1071 svm()
"nn" nnet nnet()
"deep" keras keras_model_sequential()

Unsupervised Methods

Method Underlying Package Function Called
"pca" stats prcomp()
"mds" stats, MASS, smacof cmdscale(), isoMDS(), etc.
"kmeans" stats kmeans()
"pam" cluster pam()
"clara" cluster clara()
"hclust" stats hclust()
"dbscan" dbscan dbscan()

Accessing the Underlying Model

You always have access to the raw model from the underlying package via $fit:

# Example: Access the raw randomForest object
model_forest <- tl_model(iris, Species ~ ., method = "forest")
class(model_forest$fit)  # This is the randomForest object
#> [1] "randomForest.formula" "randomForest"

# Use package-specific functions if needed
# randomForest::varImpPlot(model_forest$fit)

Next Steps

Now that you understand the basics, explore:

  1. Supervised Learning - Dive deeper into classification and regression
  2. Unsupervised Learning - Explore clustering and dimensionality reduction
  3. Integration Workflows - Combine supervised and unsupervised learning
  4. AutoML - Automated machine learning with tl_auto_ml()

Summary

tidylearn is a wrapper package that provides: a unified tl_model() interface for both supervised and unsupervised methods, data preprocessing via tl_prepare_data(), train-test splitting via tl_split(), and direct access to the raw fitted objects via $fit.

The underlying algorithms are unchanged - tidylearn simply makes them easier to use together.

# Quick example combining everything
data_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)
data_prep <- tl_prepare_data(data_split$train, Species ~ ., scale_method = "standardize")
#> Scaling numeric features using method: standardize
model_final <- tl_model(data_prep$data, Species ~ ., method = "forest")
test_preds <- predict(model_final, new_data = data_split$test)

print(model_final)
#> tidylearn Model
#> ===============
#> Paradigm: supervised 
#> Method: forest 
#> Task: Classification 
#> Formula: Species ~ . 
#> 
#> Training observations: 105