3  Step 2: Change Training Dataset Size

3.1 Change Training Dataset Size - Random Forest

Code
library(randomForest)
randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
Code
library(DALEX)
Welcome to DALEX (version: 2.4.3).
Find examples and detailed introduction at: http://ema.drwhy.ai/
Code
library(iml)
library(mlbench)
library(caret)
Loading required package: ggplot2

Attaching package: 'ggplot2'
The following object is masked from 'package:randomForest':

    margin
Loading required package: lattice
Code
library(dplyr)

Attaching package: 'dplyr'
The following object is masked from 'package:DALEX':

    explain
The following object is masked from 'package:randomForest':

    combine
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
Code
data(PimaIndiansDiabetes)
# Note: PimaIndiansDiabetes encodes missing values as zeros rather than NA
# (mlbench's PimaIndiansDiabetes2 recodes them), so na.omit() drops no rows here.
df <- na.omit(PimaIndiansDiabetes)
df$diabetes <- factor(df$diabetes, levels = c("neg", "pos"))

set.seed(5293)
train_idx <- createDataPartition(df$diabetes, p = 0.8, list = FALSE)
train_data_full <- df[train_idx, ]
test_data <- df[-train_idx, ]
x_test <- test_data[1, -ncol(test_data)]  # single test observation to explain

# Fractions of the training set used to refit the model
fractions <- c(0.1, 0.3, 0.5, 1.0)
lime_by_size <- list()
shap_by_size <- list()
Code
for (f in fractions) {
  set.seed(2025)
  n_sample <- floor(nrow(train_data_full) * f)
  sampled_idx <- sample(nrow(train_data_full), n_sample)
  sub_train <- train_data_full[sampled_idx, ]

  X_sub <- sub_train[, -ncol(sub_train)]
  y_sub <- sub_train$diabetes

  rf_model <- randomForest(x = X_sub, y = y_sub, ntree = 100)

  # "LIME": local attributions via DALEX break-down
  # (predict_parts with type = "break_down" computes break-down contributions,
  # which this report uses as its LIME-style local explanation)
  explainer <- DALEX::explain(
    model = rf_model,
    data = X_sub,
    y = NULL,  # not needed for predict_parts
    predict_function = function(m, d) predict(m, d, type = "prob")[, "pos"],
    label = paste0("RF_", f * 100, "%"),
    verbose = FALSE
  )

  lime_expl <- predict_parts(
    explainer,
    new_observation = x_test,
    type = "break_down"
  )
  lime_by_size[[as.character(f)]] <- lime_expl

  # SHAP via iml
  predictor <- Predictor$new(rf_model, data = X_sub, y = y_sub, type = "prob")
  shap <- Shapley$new(predictor, x.interest = x_test)
  shap_by_size[[as.character(f)]] <- shap$results
}
Code
# LIME (break-down) for the full-data model (the last loop iteration, f = 1.0)

pred_pos <- function(m, d) {
  predict(m, d, type = "prob")[, "pos"]
}

explainer <- DALEX::explain(
  model = rf_model,
  data = X_sub,
  y = NULL,
  predict_function = pred_pos,
  label = paste0("RF_", f * 100, "%"),
  verbose = FALSE
)

lime_df <- predict_parts(
  explainer,
  new_observation = x_test,
  type = "break_down"
)

print(lime_df)
                          contribution
RF_100%: intercept               0.346
RF_100%: glucose = 85           -0.136
RF_100%: mass = 26.6            -0.039
RF_100%: age = 31                0.068
RF_100%: pregnant = 1           -0.013
RF_100%: triceps = 29            0.035
RF_100%: pressure = 66           0.008
RF_100%: pedigree = 0.351        0.009
RF_100%: insulin = 0            -0.028
RF_100%: prediction              0.250
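As a sanity check on the break-down structure: the intercept plus the feature contributions reconstructs the final prediction (0.346 - 0.136 - 0.039 + 0.068 - 0.013 + 0.035 + 0.008 + 0.009 - 0.028 = 0.250). A minimal check of this additivity:

Code
# Break-down attributions are additive: the intercept plus the feature
# contributions should equal the "prediction" row (0.250 for this observation).
sum(lime_df$contribution[lime_df$variable != "prediction"])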
Code
library(dplyr)

lime_all <- purrr::map2_dfr(
  lime_by_size,
  names(lime_by_size),
  ~ .x |>
    filter(variable != "intercept", variable != "", variable != "prediction") |>
    mutate(variable = factor(variable),
           fraction = .y)
)


lime_sd_summary <- lime_all |>
  group_by(variable) |>
  summarise(sd_contribution = sd(contribution), .groups = "drop")


lime_sd_summary |>
  arrange(desc(sd_contribution)) |>
  print()
# A tibble: 8 × 2
  variable         sd_contribution
  <fct>                      <dbl>
1 age = 31                  0.0370
2 pressure = 66             0.0257
3 glucose = 85              0.0242
4 pregnant = 1              0.0236
5 triceps = 29              0.0210
6 mass = 26.6               0.0180
7 pedigree = 0.351          0.0138
8 insulin = 0               0.0133
Code
library(ggplot2)

ggplot(lime_sd_summary, aes(x = reorder(variable, sd_contribution), y = sd_contribution)) +
  geom_col(fill = "grey") +
  coord_flip() +
  labs(
    title = "Stability of LIME Explanations",
    x = "Variable",
    y = "Standard Deviation"
  ) +
  theme_minimal(base_size = 14)

Code
ggplot(lime_all, aes(x = reorder(variable, contribution, FUN = median), y = contribution)) +
  geom_boxplot(fill = "orange", color = "black", outlier.shape = 21) +
  coord_flip() +
  labs(
    title = "Distribution of LIME Contributions",
    x = "Variable",
    y = "Contribution"
  ) +
  theme_minimal(base_size = 14)

Code
# SHAP

library(dplyr)
library(ggplot2)

shap_df <- bind_rows(lapply(names(shap_by_size), function(name) {
  df <- shap_by_size[[name]]
  df$train_size <- name
  return(df)
}))

shap_df_clean <- shap_df |>
  rename(variable = feature,
         contribution = phi) |>
  filter(variable != "intercept", variable != "") |>
  mutate(variable = factor(variable))

shap_sd_summary <- shap_df_clean |>
  group_by(variable) |>
  summarise(sd_contribution = sd(contribution), .groups = "drop")

print(shap_sd_summary |> arrange(desc(sd_contribution)))
# A tibble: 8 × 2
  variable sd_contribution
  <fct>              <dbl>
1 glucose           0.119 
2 age               0.0757
3 pregnant          0.0627
4 mass              0.0581
5 triceps           0.0245
6 pressure          0.0178
7 insulin           0.0142
8 pedigree          0.0135
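One caveat, stated here as an assumption worth verifying: for a two-class Predictor built with type = "prob", iml's Shapley$new() typically returns one row per feature per class. If a class column is present in the results, restricting to the positive class avoids mixing mirrored contributions into the SD:

Code
# Hedged sketch: if the iml results carry a 'class' column, keep only the
# positive class before computing standard deviations.
if ("class" %in% names(shap_df_clean)) {
  shap_df_clean <- shap_df_clean |> filter(class == "pos")
}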
Code
ggplot(shap_df_clean, aes(x = variable, y = contribution)) +
  geom_boxplot(fill = "pink") +
  labs(title = "Distribution of SHAP Contributions",
       x = "Variable", y = "Contribution") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Code
ggplot(shap_sd_summary, aes(x = reorder(variable, -sd_contribution), y = sd_contribution)) +
  geom_col(fill = "brown") +
  labs(title = "Standard Deviation of SHAP Contributions",
       x = "Variable", y = "Standard Deviation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3.2 Change Training Dataset Size Analysis - Random Forest

We trained random forest models on 10%, 30%, 50%, and 100% subsets of the PimaIndiansDiabetes training data. For each subset, LIME (break-down) and SHAP explanations were generated for the same test observation, and the standard deviation of each feature's contribution across training sizes was used to quantify explanation variability.

The LIME results show that insulin and pedigree have the lowest standard deviations, making them the most stable explanations, while age and pressure vary the most across training sizes; glucose and mass fall in between. No feature's attribution is immune to changes in the training data, but some contributions are considerably more sensitive than others. The SHAP results show a broadly similar picture with greater overall variability; notably, glucose has the highest standard deviation under SHAP, indicating that its attribution is particularly sensitive to the training data size.

In conclusion, increasing the size of the training data generally leads to more stable and consistent feature attributions.
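One way to condense stability into a single number, sketched here as an extension beyond the summary above, is the rank correlation between the attribution vectors of the smallest and largest training fractions, using the lime_all table already built:

Code
# Sketch: rank agreement of break-down attributions between the 10% and 100%
# models. Values near 1 mean the two models order features similarly.
library(tidyr)
lime_wide <- lime_all |>
  select(variable, fraction, contribution) |>
  pivot_wider(names_from = fraction, values_from = contribution)
cor(lime_wide$`0.1`, lime_wide$`1`, method = "spearman")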

3.3 Change Training Dataset Size - Logistic Regression

We examined how the size of the training dataset affects the variability of LIME explanations, particularly across different feature-value pairs (e.g., glucose = 101, mass = 31.4). The bar chart displays the standard deviation (SD) of LIME contributions at four training sizes: 10%, 30%, 50%, and 100%.

Code
# S3 methods for the `lime` package's generics; the DALEX workflow below does
# not call them, so they are kept only for completeness.
model_type.glm <- function(x, ...) "classification"
predict_model.glm <- function(x, newdata, ...) {
  preds <- predict(x, newdata, type = "response")
  data.frame(`No` = 1 - preds, `Yes` = preds)
}


data(PimaIndiansDiabetes)
df <- na.omit(PimaIndiansDiabetes)
set.seed(5293)
df$diabetes <- factor(df$diabetes)
X <- df[, -ncol(df)]
y <- df$diabetes

lime_contributions <- list()
shap_contributions <- list()


train_idx <- createDataPartition(df$diabetes, p = 0.8, list = FALSE)
train_data <- df[train_idx, ]
test_data <- df[-train_idx, ]
Code
sizes <- c(0.1, 0.3, 0.5, 1.0)
n_obs <- 5      # number of test observations to explain
n_repeats <- 5  # resampling repetitions per training size

size_results <- list()

for (size in sizes) {
  for (rep in 1:n_repeats) {

    # At size = 1.0 every repeat uses the identical full training set,
    # so the five repeated fits (and their explanations) coincide.
    if (size == 1.0) {
      sub_train <- train_data
    } else {
      sub_idx <- sample(nrow(train_data), size = floor(nrow(train_data) * size))
      sub_train <- train_data[sub_idx, ]
    }

    logit_model <- glm(diabetes ~ ., data = sub_train, family = binomial)

    explainer <- DALEX::explain(
      logit_model,
      data = sub_train[, -ncol(sub_train)],
      y = as.numeric(sub_train$diabetes == "pos"),
      label = paste0("glm_size_", size),
      verbose = FALSE  # suppress the per-fit creation log
    )

    for (i in 1:n_obs) {
      lime_expl <- predict_parts(
        explainer,
        new_observation = test_data[i, -ncol(test_data)],
        type = "break_down"
      )
      lime_expl$size <- as.character(size)
      lime_expl$rep <- rep
      lime_expl$obs <- i
      size_results[[length(size_results) + 1]] <- lime_expl
    }
  }
}
(Explainer-creation log suppressed with verbose = FALSE. The twenty fits cover 61, 184, 307, and 615 training rows for the 10%, 30%, 50%, and 100% sizes respectively; at 100%, all five repeated fits use the same data and are therefore identical.)
Code
size_df <- bind_rows(size_results) |>
  filter(!is.na(variable), variable != "intercept")
# Note: the "prediction" row is retained here; its variability is discussed below.

robust_summary <- size_df |>
  group_by(size, variable) |>
  summarise(sd_contribution = sd(contribution, na.rm = TRUE), .groups = "drop")

ggplot(robust_summary, aes(x = variable, y = sd_contribution, fill = size)) +
  geom_col(position = position_dodge()) +
  labs(title = "LIME Contribution Variability across Training Sizes",
       x = "Variable", y = "Standard Deviation of Contribution") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

3.4 Change Training Dataset Size Analysis - Logistic Regression

Overall trend:

As the training size increases, the standard deviation (SD) of most feature contributions tends to decrease, indicating improved explanation stability. In general, the red bars (10%) are taller than the green (30%), blue (50%), and purple (100%) bars, especially for features such as glucose. A few feature-value pairs stay low throughout, but most show reduced SD as more data is used.

However, for the prediction row (the final break-down term, not an input feature), the SD remains consistently high across all training sizes, suggesting that the attribution assigned to the predicted outcome itself is inherently unstable.

High-variability pairs:

Among all feature-value pairs, glucose = 166 shows the highest standard deviation at small training sizes, suggesting that its LIME contribution is highly sensitive to fluctuations in the training data. This variability shrinks as the training size grows, indicating improved robustness.

As noted above, the prediction row consistently exhibits the highest SD across all training sizes; when only feature attributions matter, it can be excluded, as sketched below.
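Because the prediction row is the final break-down term rather than an input feature, a minimal filter over the tables built above removes it before summarizing:

Code
# Optional: drop the terminal "prediction" row so the SD summary covers
# input features only.
feature_summary <- size_df |>
  filter(variable != "prediction") |>
  group_by(size, variable) |>
  summarise(sd_contribution = sd(contribution, na.rm = TRUE), .groups = "drop")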

Stable feature-value pairs:

Pairs such as pressure = 66, pressure = 72, triceps = 29, triceps = 32, and insulin = 88 maintain low variability throughout, indicating that LIME's local explanations are robust for these values.

Overall, LIME explanations become more reliable with larger training sets. With small training sizes (especially below 30%), the explanations of some feature values can be highly volatile and potentially misleading. For critical medical features such as glucose and mass (BMI), we recommend using at least 50% of the available data to obtain stable, trustworthy interpretations.
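To verify such a recommendation for a specific feature, the per-size summary can be filtered directly; a quick sketch against the robust_summary table built above:

Code
# Inspect how the SD of every glucose = <value> pair shrinks with training size.
robust_summary |>
  filter(grepl("^glucose", variable)) |>
  arrange(variable, size)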