Random Forest Practice
Learning objectives
- Load and explore geochemical data for classification
- Train a Random Forest classifier using scikit-learn
- Extract and visualize feature importance
- Tune hyperparameters: n_estimators, max_depth, max_features, min_samples_split
- Use out-of-bag score as a validation estimate
- Compare Random Forest to a single decision tree
- Perform cross-validation with Random Forest
Setting Up a Random Forest in Python
Scikit-learn provides RandomForestClassifier and RandomForestRegressor in the sklearn.ensemble module. The API is identical to other scikit-learn estimators: fit(), predict(), score().
Step 1: Loading and Exploring Geochemical Data
Creating a Geochemical Dataset
We simulate well-log lithofacies data with four features measured from wireline logs. In real projects you would load this from a CSV file.
%%%python import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, classification_report
Simulated well-log data for 3 lithofacies
np.random.seed(42) n = 200 GR = np.concatenate([np.random.normal(40, 10, 80), # sandstone: low GR np.random.normal(120, 15, 70), # shale: high GR np.random.normal(20, 8, 50)]) # limestone: very low GR RHOB = np.concatenate([np.random.normal(2.35, 0.05, 80), np.random.normal(2.55, 0.08, 70), np.random.normal(2.71, 0.04, 50)]) NPHI = np.concatenate([np.random.normal(0.18, 0.03, 80), np.random.normal(0.35, 0.05, 70), np.random.normal(0.02, 0.02, 50)]) RT = np.concatenate([np.random.normal(15, 5, 80), np.random.normal(5, 2, 70), np.random.normal(200, 50, 50)]) y = np.array([0]*80 + [1]*70 + [2]*50)
df = pd.DataFrame({"GR": GR, "RHOB": RHOB, "NPHI": NPHI, "RT": RT, "litho": y}) print(df.head(10)) print("\nClass distribution:") print(df["litho"].value_counts().rename({0: "Sandstone", 1: "Shale", 2: "Limestone"})) print("\nBasic statistics:\n", df.describe()) %%%
Visualizing Feature Distributions by Class
%%%python fig, axes = plt.subplots(2, 2, figsize=(10, 8)) feature_names = ["GR", "RHOB", "NPHI", "RT"] litho_names = {0: "Sandstone", 1: "Shale", 2: "Limestone"} colors = ["#2196F3", "#F44336", "#4CAF50"]
for ax, feat in zip(axes.flatten(), feature_names): for cls in [0, 1, 2]: ax.hist(df[df["litho"]==cls][feat], bins=20, alpha=0.5, label=litho_names[cls], color=colors[cls]) ax.set_xlabel(feat) ax.set_ylabel("Count") ax.legend(fontsize=8)
plt.suptitle("Feature Distributions by Lithofacies") plt.tight_layout() plt.show() %%%
Step 2: Train a Random Forest Classifier
Basic Training with n_estimators
The most important parameter is n_estimators — the number of trees in the forest. More trees generally improve accuracy but increase computation time.
%%%python X = df[["GR", "RHOB", "NPHI", "RT"]].values y = df["litho"].values
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42 )
Train with 100 trees
rf = RandomForestClassifier(n_estimators=100, random_state=42) rf.fit(X_train, y_train) y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred)) print("\nClassification Report:") print(classification_report(y_test, y_pred, target_names=["Sandstone", "Shale", "Limestone"])) %%%
Step 3: Feature Importance Bar Chart
Visualizing Which Features Matter Most
The .feature_importances_ attribute gives the Mean Decrease in Impurity (MDI) for each feature.
%%%python feature_names = ["GR", "RHOB", "NPHI", "RT"] importances = rf.feature_importances_ idx = np.argsort(importances)[::-1] # sort descending
plt.figure(figsize=(8, 5)) bars = plt.bar(range(len(importances)), importances[idx], color="steelblue") plt.xticks(range(len(importances)), [feature_names[i] for i in idx]) plt.ylabel("Feature Importance (MDI)") plt.title("Random Forest Feature Importance — Well-Log Data")
Annotate bars with values
for bar, imp in zip(bars, importances[idx]): plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005, f"{imp:.3f}", ha="center", fontsize=10) plt.tight_layout() plt.show()
Print ranked importances
print("Feature importance ranking:") for i in idx: print(f" {feature_names[i]:6s}: {importances[i]:.4f}") %%%
Step 4: Out-of-Bag (OOB) Score
Using OOB as a Free Validation Estimate
Each tree is trained on a bootstrap sample (~63.2% of training data). The remaining ~36.8% are called out-of-bag samples. The OOB score predicts each sample using only the trees that did NOT train on it — a built-in validation estimate without needing a separate test set.
%%%python rf_oob = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42) rf_oob.fit(X_train, y_train)
print(f"OOB Accuracy: {rf_oob.oob_score_:.4f}") print(f"Test Accuracy: {rf_oob.score(X_test, y_test):.4f}") print("\nOOB and test accuracy should be similar.")
OOB class probabilities for first 5 training samples
print("\nOOB class probabilities (first 5 samples):") print(" [Sandstone, Shale, Limestone]") for i in range(5): probs = rf_oob.oob_decision_function_[i] print(f" Sample {i}: [{probs[0]:.3f}, {probs[1]:.3f}, {probs[2]:.3f}]") %%%
Step 5: Comparing n_estimators (10, 50, 100, 200)
How Many Trees Do We Need?
We plot OOB error vs. number of trees to find the sweet spot where adding more trees stops helping.
%%%python n_trees_list = [1, 5, 10, 25, 50, 100, 150, 200, 300, 500] oob_scores = [] test_scores = []
for n in n_trees_list: rf_temp = RandomForestClassifier(n_estimators=n, oob_score=True, random_state=42) rf_temp.fit(X_train, y_train) oob_scores.append(rf_temp.oob_score_) test_scores.append(rf_temp.score(X_test, y_test))
plt.figure(figsize=(9, 5)) plt.plot(n_trees_list, oob_scores, "o-", color="darkorange", label="OOB Accuracy") plt.plot(n_trees_list, test_scores, "s--", color="steelblue", label="Test Accuracy") plt.xlabel("Number of Trees (n_estimators)") plt.ylabel("Accuracy") plt.title("Accuracy vs. Number of Trees") plt.legend() plt.grid(True, alpha=0.3) plt.tight_layout() plt.show()
Print the table
print("n_trees OOB Test") for n, oob, test in zip(n_trees_list, oob_scores, test_scores): print(f" {n:>5d} {oob:.4f} {test:.4f}") print("\nAccuracy typically plateaus around 100-200 trees.") %%%
Step 6: Random Forest vs. Single Decision Tree
Side-by-Side Comparison
A single decision tree tends to overfit (high training accuracy, lower test accuracy). The Random Forest averages many trees, reducing variance and improving generalization.
%%%python from sklearn.tree import DecisionTreeClassifier
Single decision tree
dt = DecisionTreeClassifier(random_state=42) dt.fit(X_train, y_train)
dt_train = dt.score(X_train, y_train) dt_test = dt.score(X_test, y_test) rf_train = rf.score(X_train, y_train) rf_test = rf.score(X_test, y_test)
print("=" * 45) print("Model Train Test") print("-" * 45) print(f"Decision Tree {dt_train:.4f} {dt_test:.4f}") print(f"Random Forest (100) {rf_train:.4f} {rf_test:.4f}") print("=" * 45) print(f"RF improvement on test: {rf_test - dt_test:+.4f}")
Repeat with 20 different random splits to show stability
dt_scores = [] rf_scores = [] for i in range(20): Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.3, random_state=i) dt.fit(Xtr, ytr) dt_scores.append(dt.score(Xte, yte)) rf_temp = RandomForestClassifier(n_estimators=100, random_state=i) rf_temp.fit(Xtr, ytr) rf_scores.append(rf_temp.score(Xte, yte))
print(f"\nOver 20 random splits:") print(f" DT mean: {np.mean(dt_scores):.4f} +/- {np.std(dt_scores):.4f}") print(f" RF mean: {np.mean(rf_scores):.4f} +/- {np.std(rf_scores):.4f}") print("RF is typically more accurate AND more stable (lower std).") %%%
Step 7: Cross-Validation with Random Forest
k-Fold Cross-Validation
Cross-validation gives a more robust accuracy estimate by using every sample for both training and testing across folds.
%%%python from sklearn.model_selection import cross_val_score, StratifiedKFold
5-fold stratified cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) rf_cv = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(rf_cv, X, y, cv=cv, scoring="accuracy")
print("5-Fold Cross-Validation Results:") for i, score in enumerate(scores): print(f" Fold {i+1}: {score:.4f}") print(f"\n Mean: {scores.mean():.4f}") print(f" Std: {scores.std():.4f}") print("\nStratified CV preserves class proportions in each fold.") %%%
Step 8: Hyperparameter Tuning
Exploring max_depth, max_features, and min_samples_split
Beyond n_estimators, these parameters control tree complexity and can prevent overfitting.
%%%python from sklearn.model_selection import GridSearchCV
param_grid = { "n_estimators": [50, 100, 200], "max_depth": [None, 5, 10, 20], "max_features": ["sqrt", "log2", None], "min_samples_split": [2, 5, 10] }
Use a smaller grid for speed in this demo
small_grid = { "n_estimators": [50, 100], "max_depth": [None, 10], "min_samples_split": [2, 5] }
grid_search = GridSearchCV( RandomForestClassifier(random_state=42), small_grid, cv=3, scoring="accuracy", n_jobs=-1, verbose=0 ) grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_) print(f"Best CV accuracy: {grid_search.best_score_:.4f}") print(f"Test accuracy: {grid_search.score(X_test, y_test):.4f}")
Show top 5 parameter combinations
results = pd.DataFrame(grid_search.cv_results_) results = results.sort_values("rank_test_score") print("\nTop 5 parameter combinations:") for _, row in results.head(5).iterrows(): print(f" Rank {int(row.rank_test_score)}: {row.params} -> {row.mean_test_score:.4f}") %%%
Effect of max_depth on Overfitting
%%%python depths = [2, 3, 5, 10, 20, None] train_accs = [] test_accs = []
for d in depths: rf_d = RandomForestClassifier(n_estimators=100, max_depth=d, random_state=42) rf_d.fit(X_train, y_train) train_accs.append(rf_d.score(X_train, y_train)) test_accs.append(rf_d.score(X_test, y_test))
labels = [str(d) if d else "None" for d in depths] x_pos = range(len(depths))
plt.figure(figsize=(8, 5)) plt.plot(x_pos, train_accs, "o-", label="Train", color="steelblue") plt.plot(x_pos, test_accs, "s--", label="Test", color="darkorange") plt.xticks(x_pos, labels) plt.xlabel("max_depth") plt.ylabel("Accuracy") plt.title("Effect of max_depth on RF Performance") plt.legend() plt.grid(True, alpha=0.3) plt.tight_layout() plt.show()
print("Shallow trees (max_depth=2-3) underfit.") print("Unlimited depth may slightly overfit but RF is robust.") %%%
[Library: scikit-learn]
References
- Pedregosa, F., et al. (2011). Scikit-learn: Machine learning in Python. J. Mach. Learn. Res. 12, 2825–2830.
- Géron, A. (2022). Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow (3rd ed.), ch. 7 (ensemble learning and random forests). O’Reilly.
- James, G., Witten, D., Hastie, T., Tibshirani, R. (2021). An Introduction to Statistical Learning (2nd ed.), ch. 8 (lab: bagging and random forests). Springer.