Scikit-learn Guide for Beginners



Scikit-learn Guide for Beginners

The Complete ML Library Tutorial for Engineering Students

Last Updated: March 2026

📌 Key Takeaways

  • What it is: Scikit-learn is Python’s go-to ML library — consistent API, 50+ algorithms, preprocessing, model selection, all in one package.
  • Core API: Every estimator follows fit() → predict()/transform() pattern.
  • Pipeline: Chain preprocessing and models — prevents data leakage, simplifies code.
  • Model selection: cross_val_score(), GridSearchCV(), RandomizedSearchCV().
  • Install: pip install scikit-learn
  • Best for: Classical ML (not deep learning — use PyTorch/TensorFlow for that).

1. The Estimator API — fit, predict, transform

Scikit-learn’s greatest strength is its consistent API. Every algorithm — from Linear Regression to Random Forest to K-Means — follows the same pattern:

| Method | What it does | Returns |
| --- | --- | --- |
| `fit(X, y)` | Train the model on data X with labels y | `self` (the trained estimator) |
| `predict(X)` | Make predictions on new data X | Array of predictions |
| `transform(X)` | Transform data X (preprocessors only) | Transformed array |
| `fit_transform(X)` | `fit()` then `transform()` — convenience method | Transformed array |
| `predict_proba(X)` | Predict class probabilities (classifiers) | Array of probabilities |
| `score(X, y)` | Evaluate the model on X with true labels y | Accuracy (classifiers) or R² (regressors) |

This consistency means once you learn one algorithm’s API, you know all of them. Swapping Random Forest for SVM requires changing just one line of code.

2. Standard ML Workflow in Scikit-learn


from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load data — keep the full Bunch so target_names can be reused in
# Step 6 without reloading the dataset a second time.
iris = load_iris()
X, y = iris.data, iris.target

# Step 2: Split into train and test (stratify keeps class proportions equal
# in both splits; random_state makes the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 3: Preprocess
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on train only!
X_test_scaled  = scaler.transform(X_test)        # transform test with train params

# Step 4: Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Step 5: Predict
y_pred = model.predict(X_test_scaled)

# Step 6: Evaluate
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(y_test, y_pred,
      target_names=iris.target_names))  # reuse the Bunch loaded in Step 1
    

3. Preprocessing


from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    LabelEncoder, OneHotEncoder, OrdinalEncoder
)
from sklearn.impute import SimpleImputer, KNNImputer
import numpy as np

# --- Feature Scaling ---
# Two columns on very different scales, to make the effect of each scaler visible.
X = np.array([[1, 100], [2, 200], [3, 300]])

# StandardScaler: per-column mean=0, std=1 (z-score standardization)
ss = StandardScaler()
print("StandardScaler:\n", ss.fit_transform(X))

# MinMaxScaler: rescales each column to the range [0, 1]
mms = MinMaxScaler()
print("MinMaxScaler:\n", mms.fit_transform(X))

# RobustScaler: robust to outliers (centers on the median, scales by IQR)
rs = RobustScaler()
print("RobustScaler:\n", rs.fit_transform(X))

# --- Handling Missing Values ---
# One NaN per affected column so both imputers have something to fill.
X_missing = np.array([[1, 2], [np.nan, 3], [7, 6], [4, np.nan]])

# Mean imputation: each NaN is replaced by its column's mean
imputer = SimpleImputer(strategy='mean')
print("After imputation:\n", imputer.fit_transform(X_missing))

# KNN imputation: each NaN is filled from the 2 nearest complete rows
# (often better than the column mean when features are correlated)
knn_imputer = KNNImputer(n_neighbors=2)
print("After KNN imputation:\n", knn_imputer.fit_transform(X_missing))

# --- Encoding Categorical Variables ---
# LabelEncoder is meant for TARGET labels. Note it assigns codes
# alphabetically (High=0, Low=1, Medium=2) — it does NOT respect the
# Low < Medium < High order. For ordinal FEATURES, prefer
# OrdinalEncoder(categories=[['Low', 'Medium', 'High']]) instead.
le = LabelEncoder()
y_cat = ['Low', 'High', 'Medium', 'High', 'Low']
print("Label Encoded:", le.fit_transform(y_cat))

# One-Hot Encoding (nominal categories: no order, one binary column per city)
ohe = OneHotEncoder(sparse_output=False)  # sparse_output=False → dense ndarray
X_cat = [['Mumbai'], ['Delhi'], ['Chennai'], ['Mumbai']]
print("One-Hot Encoded:\n", ohe.fit_transform(X_cat))
print("Categories:", ohe.categories_)

4. All Major Algorithms — Quick Reference


from sklearn.linear_model import (LinearRegression, LogisticRegression,
                                   Ridge, Lasso, ElasticNet)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import (RandomForestClassifier, RandomForestRegressor,
                               GradientBoostingClassifier,
                               GradientBoostingRegressor, AdaBoostClassifier)
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
import numpy as np  # used below for np.bincount; was missing from this snippet

# --- CLASSIFICATION ---
# One instance of each major classifier family, with sensible starter settings.
classifiers = {
    'Logistic Regression':    LogisticRegression(max_iter=1000),
    'Decision Tree':          DecisionTreeClassifier(max_depth=5),
    'Random Forest':          RandomForestClassifier(n_estimators=100),
    'Gradient Boosting':      GradientBoostingClassifier(n_estimators=100),
    'SVM (RBF)':              SVC(kernel='rbf', probability=True),
    'KNN':                    KNeighborsClassifier(n_neighbors=5),
    'Naive Bayes':            GaussianNB(),
    'Neural Network':         MLPClassifier(hidden_layer_sizes=(100,50))
}

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import cross_val_score
X, y = load_breast_cancer(return_X_y=True)

# Thanks to the uniform estimator API, one loop benchmarks them all.
print("Classifier Comparison (10-fold CV Accuracy):")
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    print(f"  {name:<25}: {scores.mean():.3f} ± {scores.std():.3f}")

# --- REGRESSION ---
from sklearn.datasets import load_diabetes
X_reg, y_reg = load_diabetes(return_X_y=True)

regressors = {
    'Linear Regression':   LinearRegression(),
    'Ridge':               Ridge(alpha=1.0),
    'Lasso':               Lasso(alpha=0.1),
    'Random Forest':       RandomForestRegressor(n_estimators=100),
    # Fixed: this is a regression task, so use the Regressor variant
    # (the original used GradientBoostingClassifier here by mistake).
    'Gradient Boosting':   GradientBoostingRegressor()
}

# --- CLUSTERING ---
from sklearn.datasets import make_blobs
X_cl, _ = make_blobs(n_samples=300, centers=4, random_state=42)

kmeans = KMeans(n_clusters=4, random_state=42)
labels = kmeans.fit_predict(X_cl)  # fit + predict in one call
print(f"\nK-Means cluster sizes: {np.bincount(labels)}")

# --- DIMENSIONALITY REDUCTION ---
# Project the 30-feature breast-cancer data down to 2 components.
pca = PCA(n_components=2)
X_2d = pca.fit_transform(X)
print(f"\nPCA: {X.shape} → {X_2d.shape}")
print(f"Explained variance: {pca.explained_variance_ratio_}")
    

5. Pipelines — The Right Way


from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
import pandas as pd

# Simple Pipeline — prevents data leakage automatically:
# each step's fit() sees only training folds, never the held-out data.
pipeline = Pipeline([
    ('imputer',  SimpleImputer(strategy='median')),   # fill NaNs with column median
    ('scaler',   StandardScaler()),                   # then standardize
    ('model',    RandomForestClassifier(n_estimators=100))  # final estimator
])

# Even simpler with make_pipeline — step names are auto-generated
# from the class names (e.g. 'simpleimputer', 'standardscaler').
pipeline2 = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler(),
    RandomForestClassifier(n_estimators=100)
)

# --- ColumnTransformer: different preprocessing per column type ---
# (for mixed numerical + categorical data)
# NOTE(review): these column names assume a DataFrame with this schema —
# adapt them to your own data.
numerical_features = ['age', 'income', 'score']
categorical_features = ['city', 'education']

# Numeric columns: impute with the median, then standardize.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',  StandardScaler())
])

# Categorical columns: impute with the mode, then one-hot encode.
# handle_unknown='ignore' keeps predict() from crashing on categories
# that never appeared during fit.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column list through its matching sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

# Preprocessing + model as a single estimator: fit/predict/grid-search it
# exactly like any other scikit-learn model.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100))
])

# The pipeline handles all preprocessing correctly within CV
# cross_val_score(full_pipeline, X_df, y, cv=10)  # No data leakage!
    

6. Model Selection & Hyperparameter Tuning


from sklearn.model_selection import (cross_val_score, GridSearchCV,
                                      RandomizedSearchCV, StratifiedKFold)
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_breast_cancer
from scipy.stats import randint

# Dataset plus a shuffled, stratified 10-fold splitter reused below.
X, y = load_breast_cancer(return_X_y=True)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# --- Cross-Validation ---
# Score one fixed forest on every fold with ROC AUC.
baseline_forest = RandomForestClassifier(n_estimators=100, random_state=42)
fold_aucs = cross_val_score(baseline_forest, X, y, cv=cv, scoring='roc_auc')
print(f"CV AUC: {fold_aucs.mean():.3f} ± {fold_aucs.std():.3f}")

# --- GridSearchCV (exhaustive) ---
# Every combination in the grid (3 × 3 × 3 = 27) gets its own 5-fold CV run.
search_space = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_leaf': [1, 5, 10],
}
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=search_space,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,   # parallelize across all CPU cores
    verbose=1,
)
grid_search.fit(X, y)
print(f"\nBest params: {grid_search.best_params_}")
print(f"Best AUC:    {grid_search.best_score_:.3f}")

# --- RandomizedSearchCV (faster for large param spaces) ---
# Instead of enumerating everything, sample 50 configurations at random;
# randint(...) draws integers from the given half-open range.
sampled_space = {
    'n_estimators': randint(50, 500),
    'max_depth': [None, 5, 10, 20, 30],
    'min_samples_leaf': randint(1, 20),
}
rand_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=sampled_space,
    n_iter=50,   # number of random combinations to try
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
rand_search.fit(X, y)
print(f"\nBest params (random): {rand_search.best_params_}")
    

7. Evaluation Metrics


from sklearn.metrics import (
    # Classification
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report,
    # Regression
    mean_absolute_error, mean_squared_error, r2_score
)
import numpy as np

# --- Classification metrics on a tiny hand-made example ---
y_true = [1, 0, 1, 1, 0, 1, 0, 0, 1, 0]
y_pred = [1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
y_prob = [0.9, 0.2, 0.8, 0.4, 0.1, 0.7, 0.6, 0.3, 0.85, 0.15]

# (label, value) pairs; labels are padded so the numbers line up in a column.
scalar_metrics = [
    ("Accuracy: ", accuracy_score(y_true, y_pred)),
    ("Precision:", precision_score(y_true, y_pred)),
    ("Recall:   ", recall_score(y_true, y_pred)),
    ("F1:       ", f1_score(y_true, y_pred)),
    ("AUC-ROC:  ", roc_auc_score(y_true, y_prob)),  # needs scores, not labels
]
for label, value in scalar_metrics:
    print(f"{label} {value:.3f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_true, y_pred))
print("\nClassification Report:")
print(classification_report(y_true, y_pred))

# --- Regression metrics on a tiny hand-made example ---
y_true_r = [3.0, 5.0, 2.5, 7.0, 4.0]
y_pred_r = [2.8, 5.2, 2.0, 6.5, 4.3]
mse = mean_squared_error(y_true_r, y_pred_r)  # RMSE is just sqrt(MSE)
print(f"\nMAE:  {mean_absolute_error(y_true_r, y_pred_r):.4f}")
print(f"RMSE: {np.sqrt(mse):.4f}")
print(f"R²:   {r2_score(y_true_r, y_pred_r):.4f}")
    

8. Quick Reference Cheatsheet

| Task | Import | Class |
| --- | --- | --- |
| Linear Regression | `sklearn.linear_model` | `LinearRegression()` |
| Logistic Regression | `sklearn.linear_model` | `LogisticRegression()` |
| Ridge Regression | `sklearn.linear_model` | `Ridge(alpha=1.0)` |
| Lasso Regression | `sklearn.linear_model` | `Lasso(alpha=0.1)` |
| Decision Tree | `sklearn.tree` | `DecisionTreeClassifier()` |
| Random Forest | `sklearn.ensemble` | `RandomForestClassifier()` |
| Gradient Boosting | `sklearn.ensemble` | `GradientBoostingClassifier()` |
| SVM | `sklearn.svm` | `SVC(kernel='rbf')` |
| KNN | `sklearn.neighbors` | `KNeighborsClassifier(n_neighbors=5)` |
| Naive Bayes | `sklearn.naive_bayes` | `GaussianNB()` |
| K-Means | `sklearn.cluster` | `KMeans(n_clusters=3)` |
| PCA | `sklearn.decomposition` | `PCA(n_components=2)` |
| Standard Scaler | `sklearn.preprocessing` | `StandardScaler()` |
| One-Hot Encoder | `sklearn.preprocessing` | `OneHotEncoder()` |
| Train-Test Split | `sklearn.model_selection` | `train_test_split(X, y, test_size=0.2)` |
| Cross-Validation | `sklearn.model_selection` | `cross_val_score(model, X, y, cv=10)` |
| Grid Search | `sklearn.model_selection` | `GridSearchCV(model, param_grid, cv=5)` |

Next Steps

Leave a Comment