KNN with Geoscience Data
Learning objectives
- Load and explore geoscience datasets for KNN classification
- Apply StandardScaler feature scaling before KNN training
- Implement KNN classification with train/test split
- Plot the accuracy-vs-K elbow curve to select the best K
- Evaluate with confusion matrix and classification report
- Compare distance metrics (Euclidean, Manhattan, Chebyshev)
- Perform cross-validation with cross_val_score
- Visualise KNN decision boundaries in 2D
KNN in Python — Complete Geoscience Workflow
scikit-learn provides KNeighborsClassifier for classification and KNeighborsRegressor for regression. In this lab, we build a complete KNN pipeline for classifying rock types from mineral composition data, covering every step from data loading to cross-validated model selection.
Step 1: Loading and Exploring the Data
Our dataset consists of rock samples analysed for mineral composition. Each sample has three features (quartz %, feldspar %, calcite %) and a rock-type label. In a real project, this data might come from thin-section point counting or XRD analysis.
%%%python import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.neighbors import KNeighborsClassifier from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix)
Rock classification from mineral composition (%)
np.random.seed(42) n_sand, n_ark, n_lime = 40, 35, 25
quartz = np.concatenate([ np.random.normal(80, 8, n_sand), np.random.normal(45, 7, n_ark), np.random.normal(8, 4, n_lime) ]) feldspar = np.concatenate([ np.random.normal(8, 4, n_sand), np.random.normal(38, 6, n_ark), np.random.normal(5, 3, n_lime) ]) calcite = np.concatenate([ np.random.normal(3, 2, n_sand), np.random.normal(7, 4, n_ark), np.random.normal(82, 8, n_lime) ]) quartz = np.clip(quartz, 0, 100) feldspar = np.clip(feldspar, 0, 100) calcite = np.clip(calcite, 0, 100)
X = np.column_stack([quartz, feldspar, calcite]) y = np.array([0]*n_sand + [1]*n_ark + [2]*n_lime) feature_names = ["Quartz %", "Feldspar %", "Calcite %"] class_names = ["Sandstone", "Arkose", "Limestone"]
print(f"Dataset: {X.shape[0]} samples, {X.shape[1]} features") print(f"Classes: {dict(zip(class_names, [n_sand, n_ark, n_lime]))}") %%%
Step 2: Data Exploration
Before modelling, explore the data visually to understand feature distributions and class separability:
%%%python
Feature histograms by class
fig, axes = plt.subplots(1, 3, figsize=(14, 4)) for i, (ax, fname) in enumerate(zip(axes, feature_names)): for cls, cname in enumerate(class_names): ax.hist(X[y == cls, i], bins=15, alpha=0.5, label=cname) ax.set_xlabel(fname) ax.set_ylabel("Count") ax.legend() plt.suptitle("Feature Distributions by Rock Type") plt.tight_layout() plt.show() %%%%%%python
Pairwise scatter plots
fig, axes = plt.subplots(1, 3, figsize=(15, 4)) pairs = [(0, 1), (0, 2), (1, 2)] for ax, (i, j) in zip(axes, pairs): for cls, cname in enumerate(class_names): mask = y == cls ax.scatter(X[mask, i], X[mask, j], alpha=0.6, label=cname, edgecolor="k", linewidth=0.5) ax.set_xlabel(feature_names[i]) ax.set_ylabel(feature_names[j]) ax.legend(fontsize=8) plt.suptitle("Pairwise Feature Scatter Plots") plt.tight_layout() plt.show() %%%
Step 3: Feature Scaling with StandardScaler
Feature scaling is critical for KNN. Without it, features with larger numeric ranges dominate the distance calculation.
%%%python
Train/test split (80/20, stratified)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42, stratify=y ) print(f"Training: {X_train.shape[0]} samples") print(f"Test: {X_test.shape[0]} samples")
Scale features: fit on TRAINING data only
scaler = StandardScaler() X_train_s = scaler.fit_transform(X_train) X_test_s = scaler.transform(X_test)
print("\nBefore scaling — feature ranges:") for i, name in enumerate(feature_names): print(f" {name}: [{X_train[:, i].min():.1f}, " f"{X_train[:, i].max():.1f}]") print("\nAfter scaling — feature ranges:") for i, name in enumerate(feature_names): print(f" {name}: [{X_train_s[:, i].min():.2f}, " f"{X_train_s[:, i].max():.2f}]") %%%
Step 4: Basic KNN and Classification Report
%%%python knn = KNeighborsClassifier(n_neighbors=3) knn.fit(X_train_s, y_train) y_pred = knn.predict(X_test_s)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}\n") print(classification_report(y_test, y_pred, target_names=class_names)) %%%
Step 5: Accuracy vs K (Elbow Curve)
Plot both train and test accuracy to visualise the bias-variance trade-off:
%%%python k_range = range(1, 21) train_acc, test_acc = [], [] for k in k_range: knn = KNeighborsClassifier(n_neighbors=k) knn.fit(X_train_s, y_train) train_acc.append(knn.score(X_train_s, y_train)) test_acc.append(knn.score(X_test_s, y_test))
plt.figure(figsize=(9, 5)) plt.plot(k_range, train_acc, "o-", label="Train", linewidth=2) plt.plot(k_range, test_acc, "s-", label="Test", linewidth=2) plt.xlabel("K (number of neighbors)") plt.ylabel("Accuracy") plt.title("KNN: Accuracy vs K (Elbow Curve)") plt.legend() plt.xticks(list(k_range)) plt.grid(alpha=0.3) plt.tight_layout() plt.show()
best_k = list(k_range)[np.argmax(test_acc)] print(f"Best K = {best_k} (test acc = {max(test_acc):.3f})") %%%
K=1 has perfect training accuracy but may overfit. Look for the K where test accuracy peaks or plateaus.
Step 6: Confusion Matrix
%%%python knn_best = KNeighborsClassifier(n_neighbors=best_k) knn_best.fit(X_train_s, y_train) y_pred_best = knn_best.predict(X_test_s)
cm = confusion_matrix(y_test, y_pred_best) fig, ax = plt.subplots(figsize=(6, 5)) im = ax.imshow(cm, cmap="Blues") plt.colorbar(im) ax.set_xticks([0, 1, 2]) ax.set_xticklabels(class_names) ax.set_yticks([0, 1, 2]) ax.set_yticklabels(class_names) ax.set_xlabel("Predicted") ax.set_ylabel("Actual") ax.set_title(f"KNN Confusion Matrix (K={best_k})") for i in range(3): for j in range(3): color = "white" if cm[i, j] > cm.max() / 2 else "black" ax.text(j, i, str(cm[i, j]), ha="center", va="center", color=color, fontsize=14) plt.tight_layout() plt.show() %%%
Step 7: Comparing Distance Metrics
%%%python metrics = [ ("Euclidean", {"metric": "euclidean"}), ("Manhattan", {"metric": "manhattan"}), ("Minkowski p=3", {"metric": "minkowski", "p": 3}), ("Chebyshev", {"metric": "chebyshev"}), ] for name, params in metrics: knn = KNeighborsClassifier(n_neighbors=5, **params) knn.fit(X_train_s, y_train) tr = knn.score(X_train_s, y_train) te = knn.score(X_test_s, y_test) print(f"{name:<18} train={tr:.3f} test={te:.3f}") %%%
For mineral composition data, Euclidean and Manhattan typically perform similarly. Chebyshev focuses on the single largest feature difference.
Step 8: Cross-Validation for Robust K Selection
A single train/test split can give unreliable estimates. Cross-validation averages over multiple splits:
%%%python from sklearn.model_selection import cross_val_score
k_range = range(1, 21) cv_means, cv_stds = [], [] for k in k_range: knn = KNeighborsClassifier(n_neighbors=k) scores = cross_val_score(knn, X_train_s, y_train, cv=5, scoring="accuracy") cv_means.append(scores.mean()) cv_stds.append(scores.std())
cv_means = np.array(cv_means) cv_stds = np.array(cv_stds)
plt.figure(figsize=(9, 5)) plt.plot(k_range, cv_means, "o-", linewidth=2) plt.fill_between(k_range, cv_means - cv_stds, cv_means + cv_stds, alpha=0.2) plt.xlabel("K") plt.ylabel("Cross-Validation Accuracy") plt.title("5-Fold CV Accuracy vs K") plt.xticks(list(k_range)) plt.grid(alpha=0.3) plt.tight_layout() plt.show()
best_k_cv = list(k_range)[np.argmax(cv_means)] print(f"Best K by CV = {best_k_cv} " f"(mean acc = {cv_means.max():.3f} " f"+/- {cv_stds[np.argmax(cv_means)]:.3f})") %%%
The shaded band shows standard deviation across folds. Choose K with highest mean and narrow band.
Step 9: Distance-Weighted KNN
%%%python for w in ["uniform", "distance"]: knn = KNeighborsClassifier(n_neighbors=5, weights=w) scores = cross_val_score(knn, X_train_s, y_train, cv=5) print(f"weights={w:>8}: CV acc = {scores.mean():.3f} " f"+/- {scores.std():.3f}") %%%
Distance weighting gives more influence to closer neighbours, often improving results.
Step 10: Decision Boundaries for Different K
%%%python X2_train = X_train_s[:, :2] # quartz and feldspar only fig, axes = plt.subplots(1, 3, figsize=(16, 5)) for ax, k in zip(axes, [1, 5, 15]): knn2d = KNeighborsClassifier(n_neighbors=k) knn2d.fit(X2_train, y_train) x_min = X2_train[:, 0].min() - 1 x_max = X2_train[:, 0].max() + 1 y_min = X2_train[:, 1].min() - 1 y_max = X2_train[:, 1].max() + 1 xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200)) Z = knn2d.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape) ax.contourf(xx, yy, Z, alpha=0.3, cmap="Set3") ax.scatter(X2_train[:, 0], X2_train[:, 1], c=y_train, cmap="Set1", edgecolor="k") ax.set_xlabel("Quartz (scaled)") ax.set_ylabel("Feldspar (scaled)") ax.set_title(f"K = {k}") plt.suptitle("KNN Decision Boundaries") plt.tight_layout() plt.show() %%%
K=1 creates jagged boundaries (overfitting); K=15 creates smooth boundaries. The optimal K balances these.
Step 11: Predicting New Samples and Inspecting Neighbours
%%%python
Classify a new rock sample
new_sample = np.array([[60, 25, 8]]) new_scaled = scaler.transform(new_sample)
prediction = knn_best.predict(new_scaled) proba = knn_best.predict_proba(new_scaled)
print(f"Predicted: {class_names[prediction[0]]}") for name, p in zip(class_names, proba[0]): print(f" {name}: {p:.3f}")
Inspect the actual nearest neighbors
dists, idxs = knn_best.kneighbors(new_scaled) print(f"\nNearest {best_k} neighbors:") for d, idx in zip(dists[0], idxs[0]): print(f" Sample {idx} ({class_names[y_train[idx]]}), " f"dist = {d:.3f}") %%%
[Library: scikit-learn KNeighborsClassifier]
References
- Pedregosa, F., et al. (2011). Scikit-learn: Machine learning in Python. J. Mach. Learn. Res. 12, 2825–2830.
- Géron, A. (2022). Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow (3rd ed.), ch. 2 & 3 (end-to-end ML project, classification). O’Reilly.
- Hastie, T., Tibshirani, R., Friedman, J. (2009). The Elements of Statistical Learning (2nd ed.), ch. 13. Springer.