Quick Start
QlibRL provides an example implementation of the single-asset order execution task. Below is an example configuration file for training with QlibRL:
simulator:
  # Each step covers 30 minutes.
  time_per_step: 30
  # Upper bound of volume. Should be null or a float between 0 and 1; if it is a float,
  # the upper bound is that fraction of the market volume.
  vol_limit: null
env:
  # Concurrent environment workers.
  concurrency: 1
  # dummy, subproc, or shmem. Corresponds to parallelism in tianshou:
  # https://tianshou.readthedocs.io/en/master/api/tianshou.env.html#vectorenv
  parallel_mode: dummy
action_interpreter:
  class: CategoricalActionInterpreter
  kwargs:
    # Candidate actions. It can be a list of length L: [a_1, a_2, ..., a_L], or an integer n,
    # in which case a list of length n+1 is auto-generated, i.e., [0, 1/n, 2/n, ..., n/n].
    values: 14
    # Total number of steps (an upper-bound estimate).
    max_step: 8
  module_path: qlib.rl.order_execution.interpreter
state_interpreter:
  class: FullHistoryStateInterpreter
  kwargs:
    # Number of dimensions in the data.
    data_dim: 6
    # Equal to the total number of records. For example, in per-minute SAOE,
    # data_ticks is the length of the trading day in minutes.
    data_ticks: 240
    # Total number of steps (an upper-bound estimate). For example, 240 min / 30 min-per-step = 8 steps.
    max_step: 8
    # Provider of the processed data.
    processed_data_provider:
      class: PickleProcessedDataProvider
      module_path: qlib.rl.data.pickle_styled
      kwargs:
        data_dir: ./data/pickle_dataframe/feature
  module_path: qlib.rl.order_execution.interpreter
reward:
  class: PAPenaltyReward
  kwargs:
    # Penalty for trading a large volume in a short time.
    penalty: 100.0
  module_path: qlib.rl.order_execution.reward
data:
  source:
    order_dir: ./data/training_order_split
    data_dir: ./data/pickle_dataframe/backtest
    # Number of time indexes.
    total_time: 240
    # Start time index.
    default_start_time: 0
    # End time index.
    default_end_time: 240
    proc_data_dim: 6
  num_workers: 0
  queue_size: 20
network:
  class: Recurrent
  module_path: qlib.rl.order_execution.network
policy:
  class: PPO
  kwargs:
    lr: 0.0001
  module_path: qlib.rl.order_execution.policy
runtime:
  seed: 42
  use_cuda: false
trainer:
  max_epoch: 2
  # Number of policy-update passes over the data collected in each training iteration.
  repeat_per_collect: 5
  earlystop_patience: 2
  # Episodes collected per training iteration.
  episode_per_collect: 20
  batch_size: 16
  # Perform validation every n epochs.
  val_every_n_epoch: 1
  checkpoint_path: ./checkpoints
  checkpoint_every_n_iters: 1
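Every class / module_path / kwargs block in the configuration above describes a Python object that QlibRL builds by reflection. The snippet below is a simplified, conceptual sketch of that idea, not Qlib's actual loader, and it ignores nested blocks such as processed_data_provider; the final call assumes CategoricalActionInterpreter accepts the values and max_step keyword arguments shown in the config.

# A simplified, conceptual sketch of how a class / module_path / kwargs block
# becomes a Python object. This is NOT Qlib's actual loader, and it does not
# handle nested blocks such as processed_data_provider.
import importlib


def instantiate(spec: dict):
    """Import spec["module_path"], look up spec["class"], and build it with spec["kwargs"]."""
    module = importlib.import_module(spec["module_path"])
    cls = getattr(module, spec["class"])
    return cls(**spec.get("kwargs", {}))


# Example: the action interpreter from the training config above.
# values: 14 expands into the 15 candidate fractions [0, 1/14, 2/14, ..., 1].
action_interpreter = instantiate({
    "class": "CategoricalActionInterpreter",
    "module_path": "qlib.rl.order_execution.interpreter",
    "kwargs": {"values": 14, "max_step": 8},
})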
The configuration file for backtesting is as follows:
order_file: ./data/backtest_orders.csv
start_time: "9:45"
end_time: "14:44"
qlib:
  provider_uri_1min: ./data/bin
  feature_root_dir: ./data/pickle
  # Features generated from today's information.
  feature_columns_today: [
    "$open", "$high", "$low", "$close", "$vwap", "$volume",
  ]
  # Features generated from yesterday's information.
  feature_columns_yesterday: [
    "$open_v1", "$high_v1", "$low_v1", "$close_v1", "$vwap_v1", "$volume_v1",
  ]
exchange:
  # Expressions for the buy-side and sell-side trading-limit conditions.
  limit_threshold: ['$close == 0', '$close == 0']
  # Deal price for buying and selling.
  deal_price: ["If($close == 0, $vwap, $close)", "If($close == 0, $vwap, $close)"]
  volume_threshold:
    # Volume limit applied to both buying and selling; "cum" means it is a cumulative value over time.
    all: ["cum", "0.2 * DayCumsum($volume, '9:45', '14:44')"]
    # Volume limit for buying.
    buy: ["current", "$close"]
    # Volume limit for selling; "current" means it is a real-time value that does not accumulate over time.
    sell: ["current", "$close"]
strategies:
  30min:
    class: TWAPStrategy
    module_path: qlib.contrib.strategy.rule_strategy
    kwargs: {}
  1day:
    class: SAOEIntStrategy
    module_path: qlib.rl.order_execution.strategy
    kwargs:
      state_interpreter:
        class: FullHistoryStateInterpreter
        module_path: qlib.rl.order_execution.interpreter
        kwargs:
          max_step: 8
          data_ticks: 240
          data_dim: 6
          processed_data_provider:
            class: PickleProcessedDataProvider
            module_path: qlib.rl.data.pickle_styled
            kwargs:
              data_dir: ./data/pickle_dataframe/feature
      action_interpreter:
        class: CategoricalActionInterpreter
        module_path: qlib.rl.order_execution.interpreter
        kwargs:
          values: 14
          max_step: 8
      network:
        class: Recurrent
        module_path: qlib.rl.order_execution.network
        kwargs: {}
      policy:
        class: PPO
        module_path: qlib.rl.order_execution.policy
        kwargs:
          lr: 1.0e-4
          # Local path to the latest model. The model is generated during training, so run
          # training first if you want to backtest with a trained policy. You can also remove
          # this parameter to run the backtest with a randomly initialized policy.
          weight_file: ./checkpoints/latest.pth
# Concurrent environment workers.
concurrency: 5
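The exchange expressions in the backtest config can be read as ordinary conditional logic. The sketch below is a plain-Python paraphrase for intuition only (it is not code that Qlib runs): the deal-price expression falls back to VWAP when the close price is 0, and the cumulative volume limit caps executed volume at 20% of the market volume accumulated between 9:45 and 14:44.

# Plain-Python paraphrase of two exchange expressions above (for intuition only).

def deal_price(close: float, vwap: float) -> float:
    # "If($close == 0, $vwap, $close)": use VWAP when there is no valid close price.
    return vwap if close == 0 else close


def cum_volume_limit(market_volume_so_far: float) -> float:
    # "0.2 * DayCumsum($volume, '9:45', '14:44')": the cumulative executed volume
    # may not exceed 20% of the market volume accumulated within the trading window.
    return 0.2 * market_volume_so_far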
With the configuration files above, you can start training with the following command:
$ python -m qlib.rl.contrib.train_onpolicy --config_path train_config.yml
After training finishes, you can run a backtest with:
$ python -m qlib.rl.contrib.backtest --config_path backtest_config.yml
In this example, SingleAssetOrderExecution and SingleAssetOrderExecutionSimple serve as example simulators, qlib.rl.order_execution.interpreter.FullHistoryStateInterpreter and qlib.rl.order_execution.interpreter.CategoricalActionInterpreter as example interpreters, qlib.rl.order_execution.policy.PPO as an example policy, and qlib.rl.order_execution.reward.PAPenaltyReward as an example reward function. For the single-asset order execution task, developers who have defined their own simulator, interpreters, reward function, or policy only need to modify the corresponding settings in the configuration files to launch training and backtesting. Please refer to here for details on this example.
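For instance, a custom reward can be plugged in by subclassing qlib.rl.reward.Reward and pointing the reward block of the training config at it. The sketch below is a minimal, hypothetical example; it assumes the SAOE simulator state exposes order.amount (the total target amount) and a history_exec frame with an "amount" column, as the built-in PAPenaltyReward relies on. Adapt the field access if your own simulator differs.

# A minimal, hypothetical custom reward for the single-asset order execution task.
# Assumes the SAOE simulator state exposes order.amount and a history_exec frame
# with an "amount" column, like the built-in PAPenaltyReward relies on.
from qlib.rl.reward import Reward


class TradedRatioReward(Reward):
    """Illustrative only: reward the fraction of the whole order executed so far."""

    def reward(self, simulator_state) -> float:
        whole_order = simulator_state.order.amount
        executed = simulator_state.history_exec["amount"].sum()
        return float(executed / whole_order)

To use it, set class to TradedRatioReward and module_path to wherever you saved the file in the reward section of the training config; the rest of the configuration stays unchanged.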
In the future, we will provide more examples for other scenarios, such as RL-based portfolio construction.