# Remember to load the wine dataset again if you need to
from sklearn.datasets import load_wine
wine_data = load_wine()
X = wine_data.data
y = wine_data.target
feature_names = wine_data.feature_names
target_names = wine_data.target_names
import numpy as np
# Add Gaussian noise (mean=0, std=noise_factor)
noise_factor = 0.3
rng = np.random.RandomState(42)
X_noisy = X + noise_factor * rng.normal(size=X.shape)
# Add irrelevant random features
random_features = rng.normal(size=(X.shape[0], 5))  # add 5 random features
X_noisy = np.hstack([X_noisy, random_features])
from sklearn.model_selection import train_test_split
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_noisy, y, test_size=0.2, random_state=42)
What is a Random Forest?
A Random Forest is an ensemble method that builds many Decision Trees and combines their results to make more accurate and stable predictions.
Independent Decision Trees:
Each tree in a Random Forest is trained independently with the following process:
Random Data Subset: Each tree is trained on a random subset of the training data, drawn with replacement; this is known as Bootstrap Aggregating (or Bagging). A short sketch of both sampling steps follows this list.
Random Feature Subset: When creating splits in each tree, only a random subset of features is considered for each split. This adds further diversity to the trees.
No explicit depth control: By default, Random Forest doesn’t limit the depth of each tree. Therefore, trees will grow to their natural depth based on the data they’re given.
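To make these two sources of randomness concrete, here is a minimal sketch of how one tree's bootstrap sample and one per-split feature subset could be drawn. It is an illustration only, not scikit-learn's internals, and it reuses the X_noisy and y arrays defined above; sketch_rng and the other names introduced here are hypothetical.
# Illustration only, not scikit-learn's internals: the two kinds of random
# sampling used when growing a single tree in a Random Forest.
import numpy as np

sketch_rng = np.random.RandomState(0)   # hypothetical RNG, separate from rng above
n_samples, n_features = X_noisy.shape

# Bootstrap sample (Bagging): draw n_samples row indices WITH replacement
bootstrap_idx = sketch_rng.choice(n_samples, size=n_samples, replace=True)
X_boot, y_boot = X_noisy[bootstrap_idx], y[bootstrap_idx]

# Random feature subset: at each split only some features are considered;
# sqrt(n_features) is a common default for classification
subset_size = int(np.sqrt(n_features))
feature_subset = sketch_rng.choice(n_features, size=subset_size, replace=False)

print("First 10 bootstrap indices:", bootstrap_idx[:10])
print("Features considered at one split:", feature_subset)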
Voting for prediction
Since each tree is grown independently and the final prediction is made by aggregating (voting or averaging) the results of all the trees, the overall ensemble effect reduces overfitting:
Even if one or two trees are overfitted (i.e., they grow too deep), the ensemble average (or majority vote) will smooth out their influence on the final prediction. This is why Random Forests are more robust to overfitting compared to a single decision tree, which can easily memorize the training data if grown too deep.
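To illustrate the voting step itself, the snippet below fits three separate trees, each on its own bootstrap sample of the training split created earlier, and takes a majority vote for every test sample. Again this is only a sketch under those assumptions, not how scikit-learn implements it; vote_rng and the other names are hypothetical.
# Illustration of majority voting: three trees, each fit on its own bootstrap
# sample of X_train, vote on every sample in X_test.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

vote_rng = np.random.RandomState(1)      # hypothetical RNG for this sketch
tree_predictions = []
for seed in range(3):
    idx = vote_rng.choice(len(X_train), size=len(X_train), replace=True)
    tree = DecisionTreeClassifier(random_state=seed)
    tree.fit(X_train[idx], y_train[idx])
    tree_predictions.append(tree.predict(X_test))

tree_predictions = np.array(tree_predictions)   # shape: (n_trees, n_test_samples)
# Majority vote: most frequent class label across the trees for each sample
majority_vote = np.apply_along_axis(
    lambda votes: np.bincount(votes).argmax(), axis=0, arr=tree_predictions)
print(majority_vote[:10])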
Example
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Train Random Forest on the noisy dataset
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=30)
rf_clf.fit(X_train, y_train)
# Predict on training and testing data
rf_train_pred = rf_clf.predict(X_train)
rf_test_pred = rf_clf.predict(X_test)
# Accuracy scores
rf_train_acc = accuracy_score(y_train, rf_train_pred)
rf_test_acc = accuracy_score(y_test, rf_test_pred)
print(f"Random Forest Training Accuracy: {rf_train_acc:.2f}")
print(f"Random Forest Testing Accuracy: {rf_test_acc:.2f}")
Random Forest Training Accuracy: 1.00
Random Forest Testing Accuracy: 1.00
## NB This block of code repeats the DecisionTreeClassifier
## we did before, you don't need to run it again if
## you are using the same notebook
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Train Decision Trees with different depths and record accuracy
max_depth_range = range(1, 10)
train_accuracies = []
test_accuracies = []
for depth in max_depth_range:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(X_train, y_train)
    # Predict on training and testing data
    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)
    # Accuracy scores
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)
import matplotlib.pyplot as plt
plt.figure(figsize=(8, 5))
plt.plot(max_depth_range, train_accuracies, label='Decision Tree Training Accuracy', marker='o')
plt.plot(max_depth_range, test_accuracies, label='Decision Tree Testing Accuracy', marker='s')
# Add Random Forest horizontal lines for comparison
plt.axhline(y=rf_train_acc, color='green', linestyle='dashed', label='Random Forest Training Accuracy')
plt.axhline(y=rf_test_acc, color='orange', linestyle='dashdot', label='Random Forest Testing Accuracy')
plt.xlabel('Max Depth of Decision Tree')
plt.ylabel('Accuracy')
plt.title('Decision Tree vs Random Forest Accuracy')
plt.legend()
plt.grid(True)
plt.show()
importances = rf_clf.feature_importances_
indices = np.argsort(importances)[::-1]
feature_names = list(feature_names) + [f'Random{i}' for i in range(1, 6)]

plt.figure(figsize=(12, 6))
plt.title("Feature Importances from Random Forest")
plt.bar(range(len(importances)), importances[indices], align="center")
plt.xticks(range(len(importances)), [feature_names[i] for i in indices], rotation=90)
plt.tight_layout()
plt.show()
Further information about feature importance in scikit-learn: https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
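The importances plotted above come from the trees themselves (impurity-based), which are computed on the training data. Permutation importance, covered in the linked page, is instead measured on held-out data. A rough sketch of how it could be computed here, reusing the rf_clf, X_test, y_test and extended feature_names defined above:
# Permutation importance on the test split (sketch; see the linked scikit-learn example)
from sklearn.inspection import permutation_importance

perm = permutation_importance(rf_clf, X_test, y_test, n_repeats=10, random_state=42)
top = perm.importances_mean.argsort()[::-1][:5]
for i in top:
    print(f"{feature_names[i]}: {perm.importances_mean[i]:.3f} +/- {perm.importances_std[i]:.3f}")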