examples/tutorial/detailed_workflow.ipynb

python
#  Copyright (c) Microsoft Corporation.
#  Licensed under the MIT License.

Introduction

Though users can automatically run the whole Quant research workflow based on configurations with Qlib, some advanced users usually would like to carefully customize each component to explore more in Quant.

If you just want a simple example of Qlib, Quick start and workflow_by_code may be a better choice for you.

If you want to know more details about Quant research, this notebook may be a better place for you to start.

We hope this notebook can serve as a tutorial for users who are interested in the details of Quant.

This notebook demonstrates how to use Qlib to build each component step by step.

python
from pprint import pprint
from pathlib import Path
import pandas as pd
python
MARKET = "csi300"
BENCHMARK = "SH000300"
EXP_NAME = "tutorial_exp"

Data

Get data

Users can follow the steps to download data with the CLI.

In this example, we use the underlying API to download the data automatically.
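
For reference, the CLI route uses the get_data script shipped with Qlib (run from the repository root; the target directory below is the default one that qlib.init() expects):

python
# equivalent CLI download (illustrative; run from the Qlib repository root)
!python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn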

python
from qlib.tests.data import GetData

GetData().qlib_data(exists_skip=True)
python
import qlib

qlib.init()
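
qlib.init() defaults to the data downloaded above; the provider path and region can also be set explicitly. A sketch of the equivalent explicit call (these are the default values):

python
# explicit initialization, equivalent to the default qlib.init()
qlib.init(provider_uri="~/.qlib/qlib_data/cn_data", region="cn")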

Inspect raw data

Currently, Qlib supports several kinds of data sources.

Calendar

python
from qlib.data import D

print(D.calendar(start_time="2010-01-01", end_time="2017-12-31", freq="day")[:2])  # calendar data
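
The calendar is returned as a sequence of pd.Timestamp trading days, so ordinary Python operations apply; for example, counting the trading days in the range:

python
# the calendar is an ordered sequence of trading-day timestamps
cal = D.calendar(start_time="2010-01-01", end_time="2017-12-31", freq="day")
print(len(cal), cal[0], cal[-1])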

Basic data

python
df = D.features(
    ["SH601216"],
    ["$open", "$high", "$low", "$close", "$factor"],
    start_time="2020-05-01",
    end_time="2020-05-31",
)
python
import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = "notebook"
fig = go.Figure(
    data=[
        go.Candlestick(
            x=df.index.get_level_values("datetime"),
            open=df["$open"],
            high=df["$high"],
            low=df["$low"],
            close=df["$close"],
        )
    ]
)
fig.show()

price adjustment

You may notice that these prices are not what they look like in the real world.

Due to price adjustment, the stored prices differ from the raw trading prices.

python
import plotly.graph_objects as go

fig = go.Figure(
    data=[
        go.Candlestick(
            x=df.index.get_level_values("datetime"),
            open=df["$open"] / df["$factor"],
            high=df["$high"] / df["$factor"],
            low=df["$low"] / df["$factor"],
            close=df["$close"] / df["$factor"],
        )
    ]
)
fig.show()

Please notice the price gap on 2020-05-26.

If we want to represent the change of asset value by price, adjusted prices are necessary. By default, Qlib stores the adjusted prices.
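
A quick way to see the effect, reusing the df loaded above: daily returns computed on the adjusted prices stay smooth, while returns on the de-adjusted (raw) prices show the jump around the gap date. A minimal sketch:

python
# compare daily returns on adjusted vs. de-adjusted (raw) close prices
adj_ret = df["$close"].pct_change()
raw_ret = (df["$close"] / df["$factor"]).pct_change()
print("max |adjusted return|:", adj_ret.abs().max())
print("max |raw return|:", raw_ret.abs().max())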

Static universe vs. dynamic universe

Dynamic universe

python
# dynamic universe
universe = D.list_instruments(D.instruments("csi100"), start_time="2010-01-01", end_time="2020-12-31")
pprint(universe)
python
print(len(universe))

Qlib uses a dynamic universe by default.

csi100 has around 100 stocks each day (the count is not exactly 100 due to the limited precision of the data).

python
df = D.features(D.instruments("csi100"), ["$close"], start_time="2010-01-01", end_time="2020-12-31")
df.groupby("datetime").size().plot()
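
For contrast, a static universe is simply a fixed list of instruments whose membership never changes. A minimal sketch, freezing the dynamic membership loaded above into a plain list (illustrative, not how Qlib's benchmarks are defined):

python
# static universe: drop the membership time spans and keep a fixed instrument list
static_universe = list(universe.keys())
df_static = D.features(static_universe[:10], ["$close"], start_time="2010-01-01", end_time="2020-12-31")
df_static.groupby("datetime").size().plot()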

Point-In-Time data

download data

NOTE: To run the test faster, we only download the data of two stocks

python
p = Path("~/.qlib/qlib_data/cn_data/financial").expanduser()
python
if not p.exists():
    !cd ../../scripts/data_collector/pit/ && pip install -r requirements.txt
    !cd ../../scripts/data_collector/pit/ && python collector.py download_data --source_dir ~/.qlib/stock_data/source/pit --start 2000-01-01 --end 2020-01-01 --interval quarterly --symbol_regex "^(600519|000725).*"
    !cd ../../scripts/data_collector/pit/ && python collector.py normalize_data --interval quarterly --source_dir ~/.qlib/stock_data/source/pit --normalize_dir ~/.qlib/stock_data/source/pit_normalized
    !cd ../../scripts/ && python dump_pit.py dump --csv_path ~/.qlib/stock_data/source/pit_normalized --qlib_dir ~/.qlib/qlib_data/cn_data --interval quarterly

querying data

using roewa (performanceExpressROEWa, the weighted ROE published in the performance express report) as an example

If we want to get the fundamental data of the most recent quarter on a daily basis, we can use the following example.

Moutai (sh600519) released part of its fundamental data on 2019-07-13 and released the rest on 2019-07-18.

python
instruments = ["sh600519"]
data = D.features(
    instruments,
    ["P($$roewa_q)"],
    start_time="2019-01-01",
    end_time="2019-07-19",
    freq="day",
)
python
data.tail(15)

expression engine

python
D.features(
    ["sh600519"],
    ["(EMA($close, 12) - EMA($close, 26))/$close - EMA((EMA($close, 12) - EMA($close, 26))/$close, 9)/$close"],
)
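
The expression above computes a MACD-style indicator entirely inside the expression engine. Other rolling operators compose in the same way; a small sketch using the Mean, Std and Ref operators from Qlib's ops module:

python
# rolling mean and rolling volatility of daily returns, built from expressions
D.features(
    ["sh600519"],
    ["Mean($close / Ref($close, 1) - 1, 5)", "Std($close / Ref($close, 1) - 1, 20)"],
    start_time="2019-01-01",
    end_time="2019-12-31",
)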

Dataset loading and preprocessing

Some heuristic principles for creating features (a short sketch follows this list):

  • make the features comparable between instruments: remove units from the features.
  • try to keep the distribution invariant
  • keep the scale of features similar
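
As an illustration of the first principle, expressing prices as ratios removes the currency unit, so the feature is comparable across instruments with very different price levels. A minimal sketch:

python
# unit-free feature: one-day price ratio instead of the raw price level
D.features(["sh600519"], ["$close / Ref($close, 1)"], start_time="2019-01-01", end_time="2019-01-10")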

data loader

Its interface can be found here.

QlibDataLoader is an implementation which loads data from Qlib's data source.

python
from qlib.data.dataset.loader import QlibDataLoader
python
qdl = QlibDataLoader(config=(["$close / Ref($close, 10)"], ["RET10"]))
python
qdl.load(instruments=["sh600519"], start_time="20190101", end_time="20191231")

data handler

Financial data can't be perfect.

We have to process it before feeding it into models.

python
df = qdl.load(instruments=["sh600519"], start_time="20190101", end_time="20191231")
python
df.isna().sum()
python
df.plot(kind="hist")

DataHandler is responsible for data preprocessing and provides a data fetching interface.

python
from qlib.data.dataset.handler import DataHandlerLP
from qlib.data.dataset.processor import ZScoreNorm, Fillna
python
# NOTE: normally, the training & validation time range will be `fit_start_time`, `fit_end_time`
# however, all the components are decomposed here, so the training & validation time range is unknown when preprocessing.
dh = DataHandlerLP(
    instruments=["sh600519"],
    start_time="20170101",
    end_time="20191231",
    infer_processors=[
        ZScoreNorm(fit_start_time="20170101", fit_end_time="20181231"),
        Fillna(),
    ],
    data_loader=qdl,
)
python
df = dh.fetch()
python
df
python
df.isna().sum()
python
df.plot(kind="hist")

dataset

basic dataset

python
from qlib.data.dataset import DatasetH, TSDatasetH
python
ds = DatasetH(dh, segments={"train": ("20180101", "20181231"), "valid": ("20190101", "20191231")})
python
ds.prepare("train")
python
ds.prepare("valid")

Time Series Dataset

Different models require different dataset formats.

For example, Qlib provides a Time Series Dataset (TSDatasetH) to help users create time-series datasets.

python
ds = TSDatasetH(
    step_len=10,
    handler=dh,
    segments={"train": ("20180101", "20181231"), "valid": ("20190101", "20191231")},
)
train_sampler = ds.prepare("train")
python
train_sampler
python
train_sampler[0]  # Retrieving the first example
python
train_sampler["2018-01-08", "sh600519"]  # get the time series by <'timestamp', 'instrument_id'> index
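
Because the sampler supports len() and integer indexing, it can be fed to a mini-batch loader directly. A minimal sketch, assuming PyTorch is installed (this mirrors how Qlib's own PyTorch models consume TSDatasetH, but it is not required here):

python
# wrap the time-series sampler in a PyTorch DataLoader for mini-batch training
from torch.utils.data import DataLoader

loader = DataLoader(train_sampler, batch_size=64, shuffle=True)
batch = next(iter(loader))
print(batch.shape)  # (batch_size, step_len, num_columns)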

Off-the-shelf dataset

Qlib already integrates some datasets.

python
handler_kwargs = {
    "start_time": "2008-01-01",
    "end_time": "2020-08-01",
    "fit_start_time": "2008-01-01",
    "fit_end_time": "2014-12-31",
    "instruments": MARKET,
}
handler_conf = {
    "class": "Alpha158",
    "module_path": "qlib.contrib.data.handler",
    "kwargs": handler_kwargs,
}
pprint(handler_conf)
python
from qlib.utils import init_instance_by_config
python
hd = init_instance_by_config(handler_conf)

Creating instances from configs is a very common practice in Qlib (e.g. the workflow configurations are based on it).

The above configuration is the same as the code below

python
from qlib.contrib.data.handler import Alpha158

hd = Alpha158(**handler_kwargs)

This handler has the same structure as the simple one-column handler we created just now.

python
df = hd.fetch()
python
df
python
hd.data_loader
python
hd.data_loader.fields

some details

The training data may not be the same as the test data.

e.g.

  • the training dataset and the test dataset may use different filtering rules and data processing
python
hd.learn_processors
python
hd.infer_processors
python
hd.process_type  # appending type
python
hd.fetch(col_set="label", data_key=hd.DK_L)
python
hd.fetch(col_set="label", data_key=hd.DK_I)
python
dataset_conf = {
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": hd,
        "segments": {
            "train": ("2008-01-01", "2014-12-31"),
            "valid": ("2015-01-01", "2016-12-31"),
            "test": ("2017-01-01", "2020-08-01"),
        },
    },
}
python
dataset = init_instance_by_config(dataset_conf)

Model Training & Inference

Model interface

python
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord, SigAnaRecord
python
model = init_instance_by_config(
    {
        "class": "LGBModel",
        "module_path": "qlib.contrib.model.gbdt",
        "kwargs": {
            "loss": "mse",
            "colsample_bytree": 0.8879,
            "learning_rate": 0.0421,
            "subsample": 0.8789,
            "lambda_l1": 205.6999,
            "lambda_l2": 580.9768,
            "max_depth": 8,
            "num_leaves": 210,
            "num_threads": 20,
        },
    }
)
python
# start exp to train model
with R.start(experiment_name=EXP_NAME):
    model.fit(dataset)
    R.save_objects(trained_model=model)

    rec = R.get_recorder()
    rid = rec.id  # save the record id

    # Inference and saving signal
    sr = SignalRecord(model, dataset, rec)
    sr.generate()

Evaluation:

  • Signal-based
  • Portfolio-based: backtest
python
###################################
# prediction, backtest & analysis
###################################
port_analysis_config = {
    "executor": {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    },
    "strategy": {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "signal": "<PRED>",
            "topk": 50,
            "n_drop": 5,
        },
    },
    "backtest": {
        "start_time": "2017-01-01",
        "end_time": "2020-08-01",
        "account": 100000000,
        "benchmark": BENCHMARK,
        "exchange_kwargs": {
            "freq": "day",
            "limit_threshold": 0.095,
            "deal_price": "close",
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        },
    },
}

# backtest and analysis
with R.start(experiment_name=EXP_NAME, recorder_id=rid, resume=True):
    # signal-based analysis
    rec = R.get_recorder()
    sar = SigAnaRecord(rec)
    sar.generate()

    #  portfolio-based analysis: backtest
    par = PortAnaRecord(rec, port_analysis_config, "day")
    par.generate()

Loading results & Analysis

loading data

Because Qlib leverages MLflow to save models & data, all of it can also be accessed via the mlflow ui.
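
A minimal sketch, assuming the default local tracking directory (./mlruns, created where the experiments were run):

python
# launch the MLflow UI against the default local tracking directory
!mlflow ui --backend-store-uri ./mlruns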

python
# load recorder
recorder = R.get_recorder(recorder_id=rid, experiment_name=EXP_NAME)
python
# load previous results
pred_df = recorder.load_object("pred.pkl")
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")
python
# The previously trained model can be loaded, though it is not used below.
loaded_model = recorder.load_object("trained_model")
loaded_model
python
from qlib.contrib.report import analysis_model, analysis_position

analysis position

report

python
analysis_position.report_graph(report_normal_df)

risk analysis

python
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)

analysis model

python
label_df = dataset.prepare("test", col_set="label")
label_df.columns = ["label"]

score IC

python
pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)
analysis_position.score_ic_graph(pred_label)

model performance

python
analysis_model.model_performance_graph(pred_label)