Random Forest Practice

Chapter 8: Tree-Based Models II — Random Forests

Learning objectives

Load and explore geochemical data for classification
Train a Random Forest classifier using scikit-learn
Extract and visualize feature importance
Tune hyperparameters: n_estimators, max_depth, max_features, min_samples_split
Use out-of-bag score as a validation estimate
Compare Random Forest to a single decision tree
Perform cross-validation with Random Forest

Setting Up a Random Forest in Python

Scikit-learn provides RandomForestClassifier and RandomForestRegressor in the sklearn.ensemble module. The API is identical to other scikit-learn estimators: fit(), predict(), score().

Step 1: Loading and Exploring Geochemical Data

Creating a Geochemical Dataset

We simulate well-log lithofacies data with four features measured from wireline logs. In real projects you would load this from a CSV file.

%%%python import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, classification_report

Simulated well-log data for 3 lithofacies

np.random.seed(42) n = 200 GR = np.concatenate([np.random.normal(40, 10, 80), # sandstone: low GR np.random.normal(120, 15, 70), # shale: high GR np.random.normal(20, 8, 50)]) # limestone: very low GR RHOB = np.concatenate([np.random.normal(2.35, 0.05, 80), np.random.normal(2.55, 0.08, 70), np.random.normal(2.71, 0.04, 50)]) NPHI = np.concatenate([np.random.normal(0.18, 0.03, 80), np.random.normal(0.35, 0.05, 70), np.random.normal(0.02, 0.02, 50)]) RT = np.concatenate([np.random.normal(15, 5, 80), np.random.normal(5, 2, 70), np.random.normal(200, 50, 50)]) y = np.array([0]*80 + [1]*70 + [2]*50)

df = pd.DataFrame({"GR": GR, "RHOB": RHOB, "NPHI": NPHI, "RT": RT, "litho": y}) print(df.head(10)) print("\nClass distribution:") print(df["litho"].value_counts().rename({0: "Sandstone", 1: "Shale", 2: "Limestone"})) print("\nBasic statistics:\n", df.describe()) %%%

Visualizing Feature Distributions by Class

%%%python fig, axes = plt.subplots(2, 2, figsize=(10, 8)) feature_names = ["GR", "RHOB", "NPHI", "RT"] litho_names = {0: "Sandstone", 1: "Shale", 2: "Limestone"} colors = ["#2196F3", "#F44336", "#4CAF50"]

for ax, feat in zip(axes.flatten(), feature_names): for cls in [0, 1, 2]: ax.hist(df[df["litho"]==cls][feat], bins=20, alpha=0.5, label=litho_names[cls], color=colors[cls]) ax.set_xlabel(feat) ax.set_ylabel("Count") ax.legend(fontsize=8)

plt.suptitle("Feature Distributions by Lithofacies") plt.tight_layout() plt.show() %%%

Step 2: Train a Random Forest Classifier

Basic Training with n_estimators

The most important parameter is n_estimators — the number of trees in the forest. More trees generally improve accuracy but increase computation time.

%%%python X = df[["GR", "RHOB", "NPHI", "RT"]].values y = df["litho"].values

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42 )

Train with 100 trees

rf = RandomForestClassifier(n_estimators=100, random_state=42) rf.fit(X_train, y_train) y_pred = rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred)) print("\nClassification Report:") print(classification_report(y_test, y_pred, target_names=["Sandstone", "Shale", "Limestone"])) %%%

Step 3: Feature Importance Bar Chart

Visualizing Which Features Matter Most

The .feature_importances_ attribute gives the Mean Decrease in Impurity (MDI) for each feature.

%%%python feature_names = ["GR", "RHOB", "NPHI", "RT"] importances = rf.feature_importances_ idx = np.argsort(importances)[::-1] # sort descending

plt.figure(figsize=(8, 5)) bars = plt.bar(range(len(importances)), importances[idx], color="steelblue") plt.xticks(range(len(importances)), [feature_names[i] for i in idx]) plt.ylabel("Feature Importance (MDI)") plt.title("Random Forest Feature Importance — Well-Log Data")

Annotate bars with values

for bar, imp in zip(bars, importances[idx]): plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005, f"{imp:.3f}", ha="center", fontsize=10) plt.tight_layout() plt.show()

Print ranked importances

print("Feature importance ranking:") for i in idx: print(f" {feature_names[i]:6s}: {importances[i]:.4f}") %%%

Step 4: Out-of-Bag (OOB) Score

Using OOB as a Free Validation Estimate

Each tree is trained on a bootstrap sample (~63.2% of training data). The remaining ~36.8% are called out-of-bag samples. The OOB score predicts each sample using only the trees that did NOT train on it — a built-in validation estimate without needing a separate test set.

%%%python rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42) rf_oob.fit(X_train, y_train)

print(f"OOB Accuracy: {rf_oob.oob_score_:.4f}") print(f"Test Accuracy: {rf_oob.score(X_test, y_test):.4f}") print("\nOOB and test accuracy should be similar.")

OOB class probabilities for first 5 training samples

print("\nOOB class probabilities (first 5 samples):") print(" [Sandstone, Shale, Limestone]") for i in range(5): probs = rf_oob.oob_decision_function_[i] print(f" Sample {i}: [{probs[0]:.3f}, {probs[1]:.3f}, {probs[2]:.3f}]") %%%

Step 5: Comparing n_estimators (10, 50, 100, 200)

How Many Trees Do We Need?

We plot OOB error vs. number of trees to find the sweet spot where adding more trees stops helping.

%%%python n_trees_list = [1, 5, 10, 25, 50, 100, 150, 200, 300, 500] oob_scores = [] test_scores = []

for n in n_trees_list: rf_temp = RandomForestClassifier(n_estimators=n, oob_score=True, random_state=42) rf_temp.fit(X_train, y_train) oob_scores.append(rf_temp.oob_score_) test_scores.append(rf_temp.score(X_test, y_test))

plt.figure(figsize=(9, 5)) plt.plot(n_trees_list, oob_scores, "o-", color="darkorange", label="OOB Accuracy") plt.plot(n_trees_list, test_scores, "s--", color="steelblue", label="Test Accuracy") plt.xlabel("Number of Trees (n_estimators)") plt.ylabel("Accuracy") plt.title("Accuracy vs. Number of Trees") plt.legend() plt.grid(True, alpha=0.3) plt.tight_layout() plt.show()

Print the table

print("n_trees OOB Test") for n, oob, test in zip(n_trees_list, oob_scores, test_scores): print(f" {n:>5d} {oob:.4f} {test:.4f}") print("\nAccuracy typically plateaus around 100-200 trees.") %%%

Step 6: Random Forest vs. Single Decision Tree

Side-by-Side Comparison

A single decision tree tends to overfit (high training accuracy, lower test accuracy). The Random Forest averages many trees, reducing variance and improving generalization.

%%%python from sklearn.tree import DecisionTreeClassifier

Single decision tree

dt = DecisionTreeClassifier(random_state=42) dt.fit(X_train, y_train)

dt_train = dt.score(X_train, y_train) dt_test = dt.score(X_test, y_test) rf_train = rf.score(X_train, y_train) rf_test = rf.score(X_test, y_test)

print("=" * 45) print("Model Train Test") print("-" * 45) print(f"Decision Tree {dt_train:.4f} {dt_test:.4f}") print(f"Random Forest (100) {rf_train:.4f} {rf_test:.4f}") print("=" * 45) print(f"RF improvement on test: {rf_test - dt_test:+.4f}")

Repeat with 20 different random splits to show stability

dt_scores = [] rf_scores = [] for i in range(20): Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=i) dt.fit(Xtr, ytr) dt_scores.append(dt.score(Xte, yte)) rf_temp = RandomForestClassifier(n_estimators=100, random_state=i) rf_temp.fit(Xtr, ytr) rf_scores.append(rf_temp.score(Xte, yte))

print(f"\nOver 20 random splits:") print(f" DT mean: {np.mean(dt_scores):.4f} +/- {np.std(dt_scores):.4f}") print(f" RF mean: {np.mean(rf_scores):.4f} +/- {np.std(rf_scores):.4f}") print("RF is typically more accurate AND more stable (lower std).") %%%

Step 7: Cross-Validation with Random Forest

k-Fold Cross-Validation

Cross-validation gives a more robust accuracy estimate by using every sample for both training and testing across $k$ folds.

%%%python from sklearn.model_selection import cross_val_score, StratifiedKFold

5-fold stratified cross-validation

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) rf_cv = RandomForestClassifier(n_estimators=100, random_state=42)

scores = cross_val_score(rf_cv, X, y, cv=cv, scoring="accuracy")

print("5-Fold Cross-Validation Results:") for i, score in enumerate(scores): print(f" Fold {i+1}: {score:.4f}") print(f"\n Mean: {scores.mean():.4f}") print(f" Std: {scores.std():.4f}") print("\nStratified CV preserves class proportions in each fold.") %%%

Step 8: Hyperparameter Tuning

Exploring max_depth, max_features, and min_samples_split

Beyond n_estimators, these parameters control tree complexity and can prevent overfitting.

%%%python from sklearn.model_selection import GridSearchCV

param_grid = { "n_estimators": [50, 100, 200], "max_depth": [None, 5, 10, 20], "max_features": ["sqrt", "log2", None], "min_samples_split": [2, 5, 10] }

Use a smaller grid for speed in this demo

small_grid = { "n_estimators": [50, 100], "max_depth": [None, 10], "min_samples_split": [2, 5] }

grid_search = GridSearchCV( RandomForestClassifier(random_state=42), small_grid, cv=3, scoring="accuracy", n_jobs=-1, verbose=0 ) grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_) print(f"Best CV accuracy: {grid_search.best_score_:.4f}") print(f"Test accuracy: {grid_search.score(X_test, y_test):.4f}")

Show top 5 parameter combinations

results = pd.DataFrame(grid_search.cv_results_) results = results.sort_values("rank_test_score") print("\nTop 5 parameter combinations:") for _, row in results.head(5).iterrows(): print(f" Rank {int(row.rank_test_score)}: {row.params} -> {row.mean_test_score:.4f}") %%%

Effect of max_depth on Overfitting

%%%python depths = [2, 3, 5, 10, 20, None] train_accs = [] test_accs = []

for d in depths: rf_d = RandomForestClassifier(n_estimators=100, max_depth=d, random_state=42) rf_d.fit(X_train, y_train) train_accs.append(rf_d.score(X_train, y_train)) test_accs.append(rf_d.score(X_test, y_test))

labels = [str(d) if d else "None" for d in depths] x_pos = range(len(depths))

plt.figure(figsize=(8, 5)) plt.plot(x_pos, train_accs, "o-", label="Train", color="steelblue") plt.plot(x_pos, test_accs, "s--", label="Test", color="darkorange") plt.xticks(x_pos, labels) plt.xlabel("max_depth") plt.ylabel("Accuracy") plt.title("Effect of max_depth on RF Performance") plt.legend() plt.grid(True, alpha=0.3) plt.tight_layout() plt.show()

print("Shallow trees (max_depth=2-3) underfit.") print("Unlimited depth may slightly overfit but RF is robust.") %%%

[Library: scikit-learn]

References

Pedregosa, F., et al. (2011). Scikit-learn: Machine learning in Python. J. Mach. Learn. Res. 12, 2825–2830.
Géron, A. (2022). Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow (3rd ed.), ch. 7 (ensemble learning and random forests). O’Reilly.
James, G., Witten, D., Hastie, T., Tibshirani, R. (2021). An Introduction to Statistical Learning (2nd ed.), ch. 8 (lab: bagging and random forests). Springer.