scientific-skills/scikit-learn/references/preprocessing.md
Data preprocessing transforms raw data into a format suitable for machine learning models. This includes scaling, encoding, handling missing values, and feature engineering.
StandardScaler (sklearn.preprocessing.StandardScaler)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) # Use same parameters as training
# Access learned parameters
print(f"Mean: {scaler.mean_}")
print(f"Std: {scaler.scale_}")
MinMaxScaler (sklearn.preprocessing.MinMaxScaler)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X_train)
# Custom range
scaler = MinMaxScaler(feature_range=(-1, 1))
X_scaled = scaler.fit_transform(X_train)
RobustScaler (sklearn.preprocessing.RobustScaler)
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_train)
Normalizer (sklearn.preprocessing.Normalizer)
from sklearn.preprocessing import Normalizer
normalizer = Normalizer(norm='l2') # Euclidean norm
X_normalized = normalizer.fit_transform(X)
MaxAbsScaler (sklearn.preprocessing.MaxAbsScaler)
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X_sparse)
OneHotEncoder (sklearn.preprocessing.OneHotEncoder)
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_encoded = encoder.fit_transform(X_categorical)
# Get feature names
feature_names = encoder.get_feature_names_out(['color', 'size'])
# Handle unknown categories during transform
X_test_encoded = encoder.transform(X_test_categorical)
OrdinalEncoder (sklearn.preprocessing.OrdinalEncoder)
from sklearn.preprocessing import OrdinalEncoder
# Natural ordering
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X_categorical)
# Custom ordering
encoder = OrdinalEncoder(categories=[['small', 'medium', 'large']])
X_encoded = encoder.fit_transform(X_categorical)
LabelEncoder (sklearn.preprocessing.LabelEncoder)
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# Decode back
y_decoded = le.inverse_transform(y_encoded)
print(f"Classes: {le.classes_}")
# Install: uv pip install category-encoders
from category_encoders import TargetEncoder
encoder = TargetEncoder()
X_train_encoded = encoder.fit_transform(X_train_categorical, y_train)
X_test_encoded = encoder.transform(X_test_categorical)
PowerTransformer
from sklearn.preprocessing import PowerTransformer
# Yeo-Johnson (handles negative values)
pt = PowerTransformer(method='yeo-johnson', standardize=True)
X_transformed = pt.fit_transform(X)
# Box-Cox (positive values only)
pt = PowerTransformer(method='box-cox', standardize=True)
X_transformed = pt.fit_transform(X)
QuantileTransformer
from sklearn.preprocessing import QuantileTransformer
# Transform to uniform distribution
qt = QuantileTransformer(output_distribution='uniform', random_state=42)
X_transformed = qt.fit_transform(X)
# Transform to normal distribution
qt = QuantileTransformer(output_distribution='normal', random_state=42)
X_transformed = qt.fit_transform(X)
import numpy as np
# Log1p (log(1 + x)) - handles zeros
X_log = np.log1p(X)
# Or use FunctionTransformer
from sklearn.preprocessing import FunctionTransformer
log_transformer = FunctionTransformer(np.log1p, inverse_func=np.expm1)
X_log = log_transformer.fit_transform(X)
SimpleImputer (sklearn.impute.SimpleImputer)
from sklearn.impute import SimpleImputer
# For numerical features
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
# For categorical features
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X_categorical)
# Fill with constant
imputer = SimpleImputer(strategy='constant', fill_value=0)
X_imputed = imputer.fit_transform(X)
IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imputer = IterativeImputer(max_iter=10, random_state=42)
X_imputed = imputer.fit_transform(X)
KNNImputer
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)
X_imputed = imputer.fit_transform(X)
PolynomialFeatures
from sklearn.preprocessing import PolynomialFeatures
# Degree 2: includes x1, x2, x1^2, x2^2, x1*x2
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
# Get feature names
feature_names = poly.get_feature_names_out(['x1', 'x2'])
# Only interactions (no powers)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X)
KBinsDiscretizer
from sklearn.preprocessing import KBinsDiscretizer
# Equal-width bins
binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
X_binned = binner.fit_transform(X)
# Equal-frequency bins (quantile-based)
binner = KBinsDiscretizer(n_bins=5, encode='onehot', strategy='quantile')
X_binned = binner.fit_transform(X)
Binarizer
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.5)
X_binary = binarizer.fit_transform(X)
SplineTransformer
from sklearn.preprocessing import SplineTransformer
spline = SplineTransformer(n_knots=5, degree=3)
X_splines = spline.fit_transform(X)
CountVectorizer (sklearn.feature_extraction.text.CountVectorizer)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(
max_features=5000, # Keep top 5000 features
min_df=2, # Ignore terms appearing in < 2 documents
max_df=0.8, # Ignore terms appearing in > 80% documents
ngram_range=(1, 2) # Unigrams and bigrams
)
X_counts = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
max_features=5000,
min_df=2,
max_df=0.8,
ngram_range=(1, 2),
stop_words='english' # Remove English stop words
)
X_tfidf = vectorizer.fit_transform(documents)
HashingVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer(n_features=2**18)
X_hashed = vectorizer.transform(documents) # No fit needed
Variance Threshold
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0.01)
X_selected = selector.fit_transform(X)
SelectKBest / SelectPercentile
from sklearn.feature_selection import SelectKBest, f_classif
# Select top 10 features
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X_train, y_train)
# Get selected feature indices
selected_indices = selector.get_support(indices=True)
Recursive Feature Elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
rfe = RFE(estimator=model, n_features_to_select=10, step=1)
X_selected = rfe.fit_transform(X_train, y_train)
# Get selected features
selected_features = rfe.support_
feature_ranking = rfe.ranking_
RFECV (with Cross-Validation)
from sklearn.feature_selection import RFECV
model = RandomForestClassifier(n_estimators=100, random_state=42)
rfecv = RFECV(estimator=model, cv=5, scoring='accuracy')
X_selected = rfecv.fit_transform(X_train, y_train)
print(f"Optimal number of features: {rfecv.n_features_}")
SelectFromModel
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
selector = SelectFromModel(model, threshold='median')
selector.fit(X_train, y_train)
X_selected = selector.transform(X_train)
# Get selected features
selected_features = selector.get_support()
L1-based Feature Selection
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
model = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
selector = SelectFromModel(model)
selector.fit(X_train, y_train)
X_selected = selector.transform(X_train)
import numpy as np
Q1 = np.percentile(X, 25, axis=0)
Q3 = np.percentile(X, 75, axis=0)
IQR = Q3 - Q1
# Define outlier boundaries
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Remove outliers
mask = np.all((X >= lower_bound) & (X <= upper_bound), axis=1)
X_no_outliers = X[mask]
from scipy.stats import mstats
# Clip outliers at 5th and 95th percentiles
X_winsorized = mstats.winsorize(X, limits=[0.05, 0.05], axis=0)
from sklearn.preprocessing import FunctionTransformer
import numpy as np
def log_transform(X):
return np.log1p(X)
transformer = FunctionTransformer(log_transform, inverse_func=np.expm1)
X_transformed = transformer.fit_transform(X)
from sklearn.base import BaseEstimator, TransformerMixin
class CustomTransformer(BaseEstimator, TransformerMixin):
def __init__(self, parameter=1):
self.parameter = parameter
def fit(self, X, y=None):
# Learn parameters from X if needed
return self
def transform(self, X):
# Transform X
return X * self.parameter
transformer = CustomTransformer(parameter=2)
X_transformed = transformer.fit_transform(X)
Always fit transformers on training data only:
# Correct
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Wrong - causes data leakage
scaler = StandardScaler()
X_all_scaled = scaler.fit_transform(np.vstack([X_train, X_test]))
Combine preprocessing with models:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', LogisticRegression())
])
pipeline.fit(X_train, y_train)
Use ColumnTransformer:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
numeric_features = ['age', 'income']
categorical_features = ['gender', 'occupation']
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), numeric_features),
('cat', OneHotEncoder(), categorical_features)
]
)
X_transformed = preprocessor.fit_transform(X)
Require Scaling:
Don't Require Scaling:
Encoding Requirements: