website/versioned_docs/version-1.0.4/Explore Algorithms/LightGBM/Quickstart - Classification, Ranking, and Regression.md
LightGBM is an open-source, distributed, high-performance gradient boosting (GBDT, GBRT, GBM, or MART) framework. This framework specializes in creating high-quality, GPU-enabled decision tree algorithms for ranking, classification, and many other machine learning tasks. LightGBM is part of Microsoft's DMTK project.
In this example, we use LightGBM to build a classification model in order to predict bankruptcy.
from synapse.ml.core.platform import *
# Load the company-bankruptcy CSV from Azure Blob Storage. The first row is
# used as the header and Spark infers the column types.
# A wasbs URI needs a "<container>@<account>.blob.core.windows.net" authority;
# this is the public SynapseML sample-data container.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(
        "wasbs://publicwasb@mmlspark.blob.core.windows.net/company_bankruptcy_prediction_data.csv"
    )
)
# Report how many rows were loaded, then show the inferred schema and the data.
record_count = df.count()
print(f"records read: {record_count}")
print("Schema: ")
df.printSchema()
# NOTE(review): display() is not defined in this file; presumably supplied by
# the notebook environment.
display(df)
# Randomly split the rows with 0.85 / 0.15 weights; seed=1 fixes the random seed.
train, test = df.randomSplit([0.85, 0.15], seed=1)

from pyspark.ml.feature import VectorAssembler

# Every column except the first is assembled into a single "features" vector.
# NOTE(review): this assumes the label "Bankrupt?" is the first column — confirm.
feature_cols = df.columns[1:]
featurizer = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only the label and the assembled feature vector.
train_data = featurizer.transform(train).select("Bankrupt?", "features")
test_data = featurizer.transform(test).select("Bankrupt?", "features")

# Show how many training rows fall under each label value.
label_counts = train_data.groupBy("Bankrupt?").count()
display(label_counts)
from synapse.ml.lightgbm import LightGBMClassifier

# Configure a binary classifier over the assembled features, then fit it on
# the training split. `model` ends up bound to the fitted model.
classifier = LightGBMClassifier(
    objective="binary",
    featuresCol="features",
    labelCol="Bankrupt?",
    isUnbalance=True,
)
model = classifier.fit(train_data)
`saveNativeModel` lets you extract the underlying LightGBM model for fast deployment after you train on Spark.
from synapse.ml.lightgbm import LightGBMClassificationModel

# Choose where to store the native LightGBM model for the current platform.
# A single if/elif/else chain guarantees exactly one location is used per run,
# so the model is saved and reloaded once.
if running_on_synapse():
    native_model_path = "/models/lgbmclassifier.model"
elif running_on_synapse_internal():
    native_model_path = "Files/models/lgbmclassifier.model"
else:
    native_model_path = "/tmp/lgbmclassifier.model"

# Write the native model, then load it back so `model` is the reloaded
# LightGBMClassificationModel used by the following steps.
model.saveNativeModel(native_model_path)
model = LightGBMClassificationModel.loadNativeModelFromFile(native_model_path)
import pandas as pd
import matplotlib.pyplot as plt

feature_importances = model.getFeatureImportances()

# Label each importance with its feature column name and sort ascending.
fi = pd.Series(feature_importances, index=feature_cols).sort_values(ascending=True)
f_index, f_values = fi.index, fi.values

# print feature importances
print("f_index:", f_index)
print("f_values:", f_values)

# plot: one horizontal bar per feature, positioned at evenly spaced fractions
# in [0, 1) and labelled with the feature name.
n_features = len(fi)
x_index = [position / n_features for position in range(n_features)]
plt.rcParams["figure.figsize"] = (20, 20)
plt.barh(
    y=x_index,
    width=f_values,
    height=0.028,
    align="center",
    color="tan",
    tick_label=f_index,
)
plt.xlabel("importances")
plt.ylabel("features")
plt.show()
# Score the held-out rows and preview the first ten as a pandas DataFrame.
predictions = model.transform(test_data)
predictions.limit(10).toPandas()

from synapse.ml.train import ComputeModelStatistics

# Compare the "prediction" column against the "Bankrupt?" label to compute
# classification statistics.
classification_evaluator = ComputeModelStatistics(
    evaluationMetric="classification",
    labelCol="Bankrupt?",
    scoredLabelsCol="prediction",
)
metrics = classification_evaluator.transform(predictions)
display(metrics)
In this example, we show how to use LightGBM to build a regression model.
# Load the triazines dataset (libsvm format) from Azure Blob Storage.
# A wasbs URI needs a "<container>@<account>.blob.core.windows.net" authority;
# this is the public SynapseML sample-data container.
triazines = spark.read.format("libsvm").load(
    "wasbs://publicwasb@mmlspark.blob.core.windows.net/triazines.scale.svmlight"
)
# Report the row count and schema, then preview the first ten rows.
triazines_count = triazines.count()
print(f"records read: {triazines_count}")
print("Schema: ")
triazines.printSchema()
triazines_preview = triazines.limit(10)
display(triazines_preview)

# Randomly split the rows with 0.85 / 0.15 weights; seed=1 fixes the random seed.
train, test = triazines.randomSplit([0.85, 0.15], seed=1)
from synapse.ml.lightgbm import LightGBMRegressor

# Configure a regressor with the "quantile" objective and fit it on the
# training split.
# NOTE(review): alpha presumably selects the target quantile for this
# objective — confirm against the LightGBM parameter docs.
regressor = LightGBMRegressor(
    objective="quantile",
    alpha=0.2,
    learningRate=0.3,
    numLeaves=31,
)
model = regressor.fit(train)
print(model.getFeatureImportances())

# Score the held-out rows and show the result.
scoredData = model.transform(test)
display(scoredData)
from synapse.ml.train import ComputeModelStatistics

# Compute regression statistics by comparing the "prediction" scores with the
# "label" column of the scored data.
regression_evaluator = ComputeModelStatistics(
    evaluationMetric="regression",
    labelCol="label",
    scoresCol="prediction",
)
metrics = regression_evaluator.transform(scoredData)
display(metrics)
# Load the ranker training set (parquet) from Azure Blob Storage.
# A wasbs URI needs a "<container>@<account>.blob.core.windows.net" authority;
# this is the public SynapseML sample-data container.
df = spark.read.format("parquet").load(
    "wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_train.parquet"
)
# Report the row count and schema, then preview the first ten rows.
ranker_row_count = df.count()
print(f"records read: {ranker_row_count}")
print("Schema: ")
df.printSchema()
ranker_preview = df.limit(10)
display(ranker_preview)
from synapse.ml.lightgbm import LightGBMRanker

# Column names of the ranking dataset.
features_col = "features"
query_col = "query"
label_col = "labels"

# Collect the ranker settings in one place: input columns, output columns,
# then training options.
ranker_settings = {
    "labelCol": label_col,
    "featuresCol": features_col,
    "groupCol": query_col,
    "predictionCol": "preds",
    "leafPredictionCol": "leafPreds",
    "featuresShapCol": "importances",
    "repartitionByGroupingColumn": True,
    "numLeaves": 32,
    "numIterations": 200,
    "evalAt": [1, 3, 5],
    "metric": "ndcg",
}
lgbm_ranker = LightGBMRanker(**ranker_settings)

# Fit the ranker on the training DataFrame.
lgbm_ranker_model = lgbm_ranker.fit(df)
# Load the ranker test set (parquet) from Azure Blob Storage.
# A wasbs URI needs a "<container>@<account>.blob.core.windows.net" authority;
# this is the public SynapseML sample-data container.
dt = spark.read.format("parquet").load(
    "wasbs://publicwasb@mmlspark.blob.core.windows.net/lightGBMRanker_test.parquet"
)

# Score the test rows with the fitted ranker and preview the first ten as a
# pandas DataFrame.
predictions = lgbm_ranker_model.transform(dt)
predictions.limit(10).toPandas()