This vignette showcases tidylearn’s capability to seamlessly integrate supervised and unsupervised learning in multi-step workflows.
Important: These integration functions orchestrate
the wrapped packages (stats, glmnet, randomForest, cluster, etc.) rather
than implementing new algorithms. tidylearn provides the workflow
coordination, while the underlying packages do the computational work.
Access raw model objects via model$fit.
Use PCA or MDS to reduce feature space before supervised learning.
# Reduce dimensions before classification
reduced <- tl_reduce_dimensions(iris,
response = "Species",
method = "pca",
n_components = 3)
# Inspect reduced data
head(reduced$data)
#> # A tibble: 6 × 5
#> .obs_id PC1 PC2 PC3 Species
#> <chr> <dbl> <dbl> <dbl> <fct>
#> 1 1 -2.26 -0.478 0.127 setosa
#> 2 2 -2.07 0.672 0.234 setosa
#> 3 3 -2.36 0.341 -0.0441 setosa
#> 4 4 -2.29 0.595 -0.0910 setosa
#> 5 5 -2.38 -0.645 -0.0157 setosa
#> 6 6 -2.07 -1.48 -0.0269 setosa# Train classifier on reduced features (remove .obs_id column first)
reduced_data <- reduced$data %>% select(-starts_with(".obs"))
model_reduced <- tl_model(reduced_data, Species ~ ., method = "logistic")
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(model_reduced)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: logistic
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 150# Split data for fair comparison
split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)
# Model with original features
model_original <- tl_model(split$train, Species ~ ., method = "logistic")
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
preds_original <- predict(model_original, new_data = split$test)
acc_original <- mean(preds_original$.pred == split$test$Species)
# Model with PCA features
reduced_train <- tl_reduce_dimensions(split$train,
response = "Species",
method = "pca",
n_components = 3)
# Remove .obs_id column before modeling (it's just an identifier)
reduced_train_data <- reduced_train$data %>% select(-starts_with(".obs"))
model_pca <- tl_model(reduced_train_data, Species ~ ., method = "logistic")
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Need to transform test data using same PCA
test_predictors <- split$test %>% select(-Species)
test_transformed <- predict(reduced_train$reduction_model, new_data = test_predictors)
test_transformed$Species <- split$test$Species
test_transformed <- test_transformed %>% select(-starts_with(".obs"))
preds_pca <- predict(model_pca, new_data = test_transformed)
acc_pca <- mean(preds_pca$.pred == split$test$Species)
# Compare results
cat("Original features (4):", round(acc_original * 100, 1), "%\n")
#> Original features (4): 0 %
cat("PCA features (3):", round(acc_pca * 100, 1), "%\n")
#> PCA features (3): 0 %
cat("Feature reduction:", round((1 - 3/4) * 100, 1), "%\n")
#> Feature reduction: 25 %Add cluster assignments as features to boost supervised learning performance.
# Compare models with and without cluster features
split_comp <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)
# Without cluster features
model_no_cluster <- tl_model(split_comp$train, Species ~ ., method = "forest")
preds_no_cluster <- predict(model_no_cluster, new_data = split_comp$test)
acc_no_cluster <- mean(preds_no_cluster$.pred == split_comp$test$Species)
# With cluster features
train_clustered <- tl_add_cluster_features(split_comp$train,
response = "Species",
method = "kmeans",
k = 3)
model_with_cluster <- tl_model(train_clustered, Species ~ ., method = "forest")
# Need to get cluster model for test data
cluster_model <- attr(train_clustered, "cluster_model")
test_clusters <- predict(cluster_model, new_data = split_comp$test[, -5])
test_clustered <- split_comp$test
test_clustered$cluster_kmeans <- as.factor(test_clusters$cluster)
preds_with_cluster <- predict(model_with_cluster, new_data = test_clustered)
acc_with_cluster <- mean(preds_with_cluster$.pred == split_comp$test$Species)
cat("Without cluster features:", round(acc_no_cluster * 100, 1), "%\n")
#> Without cluster features: 93.3 %
cat("With cluster features:", round(acc_with_cluster * 100, 1), "%\n")
#> With cluster features: 93.3 %Train models with limited labels using cluster-based label propagation.
# Use only 10% of labels
set.seed(123)
labeled_indices <- sample(nrow(iris), size = 15) # Only 15 out of 150 labeled!
# Train semi-supervised model
model_semi <- tl_semisupervised(iris, Species ~ .,
labeled_indices = labeled_indices,
cluster_method = "kmeans",
supervised_method = "logistic")
#> Warning: glm.fit: algorithm did not converge
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(model_semi)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: logistic
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 150# Check how labels were propagated
label_mapping <- model_semi$semisupervised_info$label_mapping
print(label_mapping)
#> # A tibble: 3 × 2
#> cluster cluster_label
#> <int> <chr>
#> 1 1 virginica
#> 2 2 versicolor
#> 3 3 setosa# Evaluate performance
preds_semi <- predict(model_semi)
accuracy_semi <- mean(preds_semi$.pred == iris$Species)
cat("Accuracy with only", length(labeled_indices), "labels:",
round(accuracy_semi * 100, 1), "%\n")
#> Accuracy with only 15 labels: 0 %
cat("Proportion of data labeled:", round(length(labeled_indices)/nrow(iris) * 100, 1), "%\n")
#> Proportion of data labeled: 10 %# Fully supervised with same amount of data
labeled_data <- iris[labeled_indices, ]
model_full <- tl_model(labeled_data, Species ~ ., method = "logistic")
#> Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
preds_full <- predict(model_full, new_data = iris)
accuracy_full <- mean(preds_full$.pred == iris$Species)
cat("Fully supervised (15 samples):", round(accuracy_full * 100, 1), "%\n")
#> Fully supervised (15 samples): 0 %
cat("Semi-supervised (15 labels + propagation):", round(accuracy_semi * 100, 1), "%\n")
#> Semi-supervised (15 labels + propagation): 0 %Detect and handle outliers before supervised learning.
Create cluster-specific models for heterogeneous data.
# Train separate models for different clusters
stratified_models <- tl_stratified_models(mtcars, mpg ~ .,
cluster_method = "kmeans",
k = 3,
supervised_method = "linear")
#> Warning in tl_model_supervised(data, formula, method, ...): Response appears to
#> be categorical but is stored as numeric. Consider converting to factor.
#> Warning in tl_model_supervised(data, formula, method, ...): Response appears to
#> be categorical but is stored as numeric. Consider converting to factor.
# Check structure
names(stratified_models)
#> [1] "cluster_model" "supervised_models" "formula"
#> [4] "data"
length(stratified_models$supervised_models)
#> [1] 3# Predictions using stratified models
preds_stratified <- predict(stratified_models)
head(preds_stratified)
#> # A tibble: 6 × 2
#> .pred .cluster
#> <dbl> <int>
#> 1 20.7 2
#> 2 20.5 2
#> 3 24.7 2
#> 4 21.4 1
#> 5 19.2 3
#> 6 18.1 1# Calculate RMSE
rmse_stratified <- sqrt(mean((preds_stratified$.pred - mtcars$mpg)^2))
cat("Stratified Model RMSE:", round(rmse_stratified, 2), "\n")
#> Stratified Model RMSE: 1.06
# Compare with single model
model_single <- tl_model(mtcars, mpg ~ ., method = "linear")
preds_single <- predict(model_single)
rmse_single <- sqrt(mean((preds_single$.pred - mtcars$mpg)^2))
cat("Single Model RMSE:", round(rmse_single, 2), "\n")
#> Single Model RMSE: 2.15Combining multiple integration techniques:
# Step 1: Split data
workflow_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 42)
# Step 2: Reduce dimensions
workflow_reduced <- tl_reduce_dimensions(workflow_split$train,
response = "Species",
method = "pca",
n_components = 3)
# Step 3: Add cluster features to reduced data (remove .obs_id first)
workflow_reduced_clean <- workflow_reduced$data %>% select(-starts_with(".obs"))
workflow_clustered <- tl_add_cluster_features(workflow_reduced_clean,
response = "Species",
method = "kmeans",
k = 3)
# Step 4: Train final model
workflow_model <- tl_model(workflow_clustered, Species ~ ., method = "forest")
print(workflow_model)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: forest
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 105# Transform test data through same pipeline
# 1. Apply PCA transformation
test_pca <- predict(workflow_reduced$reduction_model,
new_data = workflow_split$test[, -5])
test_pca$Species <- workflow_split$test$Species
# 2. Get cluster assignments
cluster_model_wf <- attr(workflow_clustered, "cluster_model")
test_clusters_wf <- predict(cluster_model_wf,
new_data = test_pca[, grep("PC", names(test_pca))])
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
#> Warning in x - c: longer object length is not a multiple of shorter object
#> length
test_pca$cluster_kmeans <- as.factor(test_clusters_wf$cluster)
# 3. Predict
workflow_preds <- predict(workflow_model, new_data = test_pca)
workflow_accuracy <- mean(workflow_preds$.pred == workflow_split$test$Species)
cat("Complete Workflow Accuracy:", round(workflow_accuracy * 100, 1), "%\n")
#> Complete Workflow Accuracy: 88.9 %# Simulate credit data
set.seed(42)
n <- 500
credit_data <- data.frame(
age = rnorm(n, 40, 12),
income = rnorm(n, 50000, 20000),
debt_ratio = runif(n, 0, 0.5),
credit_score = rnorm(n, 700, 100),
years_employed = rpois(n, 5)
)
# Create target variable (default risk)
credit_data$default <- factor(
ifelse(credit_data$debt_ratio > 0.4 & credit_data$credit_score < 650, "Yes", "No")
)
# Split data
credit_split <- tl_split(credit_data, prop = 0.7, stratify = "default", seed = 123)# Strategy 1: Add customer segments as features
credit_clustered <- tl_add_cluster_features(credit_split$train,
response = "default",
method = "kmeans",
k = 4)
model_credit <- tl_model(credit_clustered, default ~ ., method = "forest")
# Transform test data
cluster_model_credit <- attr(credit_clustered, "cluster_model")
test_clusters_credit <- predict(cluster_model_credit,
new_data = credit_split$test[, -6])
test_credit <- credit_split$test
test_credit$cluster_kmeans <- as.factor(test_clusters_credit$cluster)
preds_credit <- predict(model_credit, new_data = test_credit)
accuracy_credit <- mean(preds_credit$.pred == credit_split$test$default)
cat("Credit Risk Model Accuracy:", round(accuracy_credit * 100, 1), "%\n")
#> Credit Risk Model Accuracy: 98.7 %tidylearn’s integration functions provide powerful ways to combine supervised and unsupervised learning:
tl_reduce_dimensions(): Use PCA/MDS as
preprocessingtl_add_cluster_features(): Engineer
features from clusterstl_semisupervised(): Train with
limited labelstl_anomaly_aware(): Handle outliers
intelligentlytl_stratified_models(): Create
cluster-specific modelsThese functions orchestrate the underlying packages (stats, cluster, glmnet, randomForest, etc.) to enable multi-step workflows with a consistent interface.
# Final integrated example
final_data <- iris
final_split <- tl_split(final_data, prop = 0.7, stratify = "Species", seed = 999)
# Combine PCA + clustering
final_reduced <- tl_reduce_dimensions(final_split$train,
response = "Species",
method = "pca",
n_components = 3)
# Remove .obs_id column before clustering
final_reduced_clean <- final_reduced$data %>% select(-starts_with(".obs"))
final_clustered <- tl_add_cluster_features(final_reduced_clean,
response = "Species",
method = "kmeans",
k = 3)
final_model <- tl_model(final_clustered, Species ~ ., method = "logistic")
#> Warning: glm.fit: algorithm did not converge
cat("Final integrated model created successfully!\n")
#> Final integrated model created successfully!
print(final_model)
#> tidylearn Model
#> ===============
#> Paradigm: supervised
#> Method: logistic
#> Task: Classification
#> Formula: Species ~ .
#>
#> Training observations: 105