# examples/jupyter/integrations/sklearn.ipynb
import modin.pandas as pd
import numpy as np
# From https://www.ritchieng.com/pandas-scikit-learn/
from sklearn.linear_model import LogisticRegression

# Load the Kaggle Titanic training set.
url = 'http://bit.ly/kaggletrain'
train = pd.read_csv(url)
train.head()

# Feature matrix: Pclass (passenger class) and Parch (parents and children).
feature_cols = ['Pclass', 'Parch']
X = train[feature_cols]

# Response vector: the survival indicator we want to predict.
y = train['Survived']

# Instantiate and fit a logistic-regression classifier.
logreg = LogisticRegression()
logreg.fit(X, y)
# Load the Kaggle Titanic test set; it has no Survived column
# because that is exactly what we are predicting.
url_test = 'http://bit.ly/kaggletest'
test = pd.read_csv(url_test)
test.head()

# Build the feature matrix for the unseen rows and score them.
X_new = test[feature_cols]
new_pred_class = logreg.predict(X_new)

# Kaggle expects two columns: PassengerId and Survived.
# Indexing by PassengerId ensures it is the first column in the CSV.
kaggle_data = pd.DataFrame(
    {'PassengerId': test['PassengerId'], 'Survived': new_pred_class}
).set_index('PassengerId')
kaggle_data.to_csv('sub.csv')

# Round-trip the training frame through pickle on disk.
train.to_pickle('train.pkl')
pd.read_pickle('train.pkl')
# From https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer

# Normalizer rescales each row to unit (L1) norm. The two column groups
# ([0, 1] and slice(2, 4)) are normalized independently of each other.
transformers = [
    ("norm1", Normalizer(norm='l1'), [0, 1]),
    ("norm2", Normalizer(norm='l1'), slice(2, 4)),
]
ct = ColumnTransformer(transformers)

X = pd.DataFrame(np.array([[0., 1., 2., 2.],
                           [1., 1., 0., 1.]]))
ct.fit_transform(X)
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import MinMaxScaler

# Hash the free-text column and min-max scale the numeric one,
# each via its own transformer in a single ColumnTransformer.
X = pd.DataFrame({
    "documents": ["First item", "second one here", "Is this the last?"],
    "width": [3, 4, 5],
})
ct = ColumnTransformer([
    ("text_preprocess", FeatureHasher(input_type="string"), "documents"),
    ("num_preprocess", MinMaxScaler(), ["width"]),
])
X_trans = ct.fit_transform(X)
# From https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html
import numpy as np
from sklearn.impute import SimpleImputer

# Learn per-column means on the fit data, then use them to fill
# the NaN holes in a new frame.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(pd.DataFrame([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]]))

X = pd.DataFrame([[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]])
print(imp_mean.transform(X))
# From https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
import numpy as np
from sklearn.model_selection import train_test_split

# Toy data: a 5x2 feature frame and a length-5 target series.
X, y = pd.DataFrame(np.arange(10).reshape((5, 2))), pd.Series(range(5))
X
list(y)

# Reproducible 67/33 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
type(X_train)

# Splitting a single sequence without shuffling keeps the original order.
train_test_split(y, shuffle=False)
import seaborn as sns

# Load the seaborn tips dataset and wrap it in a (modin) DataFrame.
tips = sns.load_dataset("tips")
tips = pd.DataFrame(tips)
pd.get_dummies(tips, drop_first=True)

from sklearn import linear_model

# Fit a linear regression of tip on total_bill and party size.
lr = linear_model.LinearRegression()
lr.fit(X=tips[["total_bill", "size"]], y=tips["tip"])

# Inspect the fitted slope coefficients and the intercept.
lr.coef_
lr.intercept_
# One-hot encode the categoricals and keep tip, total_bill, and the smoker dummy.
tips_dummy = pd.get_dummies(tips, drop_first=True)[["tip", "total_bill", "smoker_No"]]
tips_dummy

# Fit tip ~ total_bill + smoker_No.
lr2 = linear_model.LinearRegression()
lr2.fit(X=tips_dummy.iloc[:, 1:], y=tips_dummy["tip"])
lr2.coef_, lr2.intercept_

new_data = tips_dummy[["total_bill", "smoker_No"]].tail().copy()  # not really new data
# .copy() is the fix: .tail() on a column-sliced frame yields a slice, and
# adding the "predicted_tips" column to it below would otherwise be a chained
# assignment that raises SettingWithCopyWarning and, under pandas copy-on-write,
# may silently fail to write through.
new_data
# use the model to give predicted tip values
new_data["predicted_tips"] = lr2.predict(new_data)
new_data
type(new_data)