# Copyright (c) DP Technology.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from __future__ import absolute_import, division, print_function
import logging
import copy
import os
import argparse
import json
import numpy as np
import pandas as pd
import joblib
from .data import DataHub
from .models import NNModel
from .tasks import Trainer
from .utils import YamlHandler
from .utils import logger
class MolTrain(object):
    """A :class:`MolTrain` class is responsible for interface of training process of molecular data.

    The constructor loads a YAML configuration (either the packaged default or
    the one stored next to a previously trained model) and overrides it with
    the keyword arguments. :meth:`fit` then builds the data hub, trainer and
    model from that configuration and runs training.

    Attributes set by ``__init__``: ``yamlhandler``, ``config``, ``save_path``.
    Attributes set by :meth:`fit`: ``datahub``, ``data``, ``trainer``,
    ``model``, ``cv_pred``.
    """

    def __init__(self,
                 task='classification',
                 data_type='molecule',
                 epochs=10,
                 learning_rate=1e-4,
                 batch_size=16,
                 early_stopping=5,
                 metrics="none",
                 split='random',  # random, scaffold, group, stratified
                 split_group_col='scaffold',  # only active with group split
                 kfold=5,
                 save_path='./exp',
                 remove_hs=False,
                 smiles_col='SMILES',
                 target_cols=None,
                 target_col_prefix='TARGET',
                 target_anomaly_check="filter",
                 smiles_check="filter",
                 target_normalize="auto",
                 max_norm=5.0,
                 use_cuda=True,
                 use_amp=True,
                 freeze_layers=None,
                 freeze_layers_reversed=False,
                 load_model_dir=None,  # load model for transfer learning
                 model_name='unimolv1',
                 model_size='84m',
                 **params,
                 ):
        """
        Initialize a :class:`MolTrain` class.

        :param task: str, default='classification', currently support `classification`, `regression`, `multiclass`, `multilabel_classification`, `multilabel_regression`.
        :param data_type: str, default='molecule', currently support molecule, oled.
        :param epochs: int, default=10, number of epochs to train.
        :param learning_rate: float, default=1e-4, learning rate of optimizer.
        :param batch_size: int, default=16, batch size of training.
        :param early_stopping: int, default=5, early stopping patience (stored as ``config.patience``).
        :param metrics: str, default='none', metrics to evaluate model performance.

            currently support:

            - classification: auc, auprc, log_loss, acc, f1_score, mcc, precision, recall, cohen_kappa.
            - regression: mse, pearsonr, spearmanr, r2.
            - multiclass: log_loss, acc.
            - multilabel_classification: auc, auprc, log_loss, acc, mcc.
            - multilabel_regression: mae, mse, r2.

        :param split: str, default='random', split method of training dataset. currently support: random, scaffold, group, stratified, select.

            - random: random split.
            - scaffold: split by scaffold.
            - group: split by group. `split_group_col` should be specified.
            - stratified: stratified split. `split_group_col` should be specified.
            - select: use `split_group_col` to manually select the split group. Column values of `split_group_col` should range from 0 to kfold-1 to indicate the split group.

        :param split_group_col: str, default='scaffold', column name of group split.
        :param kfold: int, default=5, number of folds for k-fold cross validation.

            - 1: no split. all data will be used for training.

        :param save_path: str, default='./exp', path to save training results.
        :param remove_hs: bool, default=False, whether to remove hydrogens from molecules.
        :param smiles_col: str, default='SMILES', column name of SMILES.
        :param target_cols: list or str, default=None, column names of target values.
        :param target_col_prefix: str, default='TARGET', prefix of target column name.
        :param target_anomaly_check: str, default='filter', how to deal with anomaly target values. currently support: filter, none.
        :param smiles_check: str, default='filter', how to deal with invalid SMILES. currently support: filter, none.
        :param target_normalize: str, default='auto', how to normalize target values. 'auto' means the normalize strategy is chosen automatically.
            currently support: auto, minmax, standard, robust, log1p, none.
        :param max_norm: float, default=5.0, max norm of gradient clipping.
        :param use_cuda: bool, default=True, whether to use GPU.
        :param use_amp: bool, default=True, whether to use automatic mixed precision.
        :param freeze_layers: str or list, frozen layers selected by name prefix. ['encoder', 'gbf'] will freeze all the layers whose name starts with 'encoder' or 'gbf'.
        :param freeze_layers_reversed: bool, default=False, inverse selection of frozen layers.
        :param load_model_dir: str, default=None, path to load model for transfer learning. When given, ``config.yaml`` is read from this directory instead of the packaged default.
        :param model_name: str, default='unimolv1', currently support unimolv1, unimolv2.
        :param model_size: str, default='84m', model size. work when model_name is unimolv2. available: 84m, 164m, 310m, 570m, 1.1B.
        :param params: additional keyword arguments. They are accepted but not read by this constructor.
        """
        # Base configuration: a previously saved run if one is given,
        # otherwise the default shipped next to this module.
        if load_model_dir is not None:
            config_path = os.path.join(load_model_dir, 'config.yaml')
            logger.info('Load config file from {}'.format(config_path))
        else:
            config_path = os.path.join(os.path.dirname(__file__), 'config/default.yaml')
        self.yamlhandler = YamlHandler(config_path)
        config = self.yamlhandler.read_yaml()

        # Explicit arguments always override whatever the YAML file contained.
        config.task = task
        config.data_type = data_type
        config.epochs = epochs
        config.learning_rate = learning_rate
        config.batch_size = batch_size
        config.patience = early_stopping
        config.metrics = metrics
        config.split = split
        config.split_group_col = split_group_col
        config.kfold = kfold
        config.remove_hs = remove_hs
        config.smiles_col = smiles_col
        config.target_cols = target_cols
        config.target_col_prefix = target_col_prefix
        # The two "check" options are stored as booleans: only 'filter' enables them.
        config.anomaly_clean = target_anomaly_check == 'filter'
        config.smi_strict = smiles_check == 'filter'
        config.target_normalize = target_normalize
        config.max_norm = max_norm
        config.use_cuda = use_cuda
        config.use_amp = use_amp
        config.freeze_layers = freeze_layers
        config.freeze_layers_reversed = freeze_layers_reversed
        config.load_model_dir = load_model_dir
        config.model_name = model_name
        config.model_size = model_size

        self.save_path = save_path
        self.config = config

    def fit(self, data):
        """
        Fit the model according to the given training data with multi datasource support, including SMILES csv file and custom coordinate data.

        For example: custom coordinate data.

        .. code-block:: python

            from unimol_tools import MolTrain
            import numpy as np
            custom_data = {'target': np.random.randint(2, size=100),
                           'atoms': [['C', 'C', 'H', 'H', 'H', 'H'] for _ in range(100)],
                           'coordinates': [np.random.randn(6, 3) for _ in range(100)],
                           }
            clf = MolTrain()
            clf.fit(custom_data)

        :param data: training data, forwarded unchanged to :class:`DataHub`.
        :return: None. The predictions taken from ``self.model.cv['pred']``
            (inverse-transformed when a target scaler is present) are stored
            in ``self.cv_pred``.
        """
        self.datahub = DataHub(data=data, is_train=True, save_path=self.save_path, **self.config)
        self.data = self.datahub.data
        # Must run before Trainer/NNModel are built: it adds data-derived keys
        # (num_classes, target_cols, ...) to the config they receive.
        self.update_and_save_config()
        self.trainer = Trainer(save_path=self.save_path, **self.config)
        self.model = NNModel(self.data, self.trainer, **self.config)
        self.model.run()

        scaler = self.data['target_scaler']
        y_pred = self.model.cv['pred']
        y_true = np.array(self.data['target'])
        metrics = self.trainer.metrics
        if scaler is not None:
            # Map predictions and targets back to the original target scale.
            y_pred = scaler.inverse_transform(y_pred)
            y_true = scaler.inverse_transform(y_true)

        if self.config["task"] in ['classification', 'multilabel_classification']:
            threshold = metrics.calculate_classification_threshold(y_true, y_pred)
            # Persist only when an output directory is configured, matching
            # how update_and_save_config treats ``save_path is None``.
            if self.save_path is not None:
                joblib.dump(threshold, os.path.join(self.save_path, 'threshold.dat'))

        self.cv_pred = y_pred
        return

    def update_and_save_config(self):
        """
        Update and save config file.

        Copies ``num_classes`` and the comma-joined ``target_cols`` (and
        ``multiclass_cnt`` for the multiclass task) from ``self.data`` into
        ``self.config``, records ``split_method`` as ``"<kfold>fold_<split>"``,
        and, when ``self.save_path`` is not None, writes the config to
        ``<save_path>/config.yaml``, creating the directory if needed.

        :return: None.
        """
        self.config['num_classes'] = self.data['num_classes']
        self.config['target_cols'] = ','.join(self.data['target_cols'])
        if self.config['task'] == 'multiclass':
            self.config['multiclass_cnt'] = self.data['multiclass_cnt']
        self.config['split_method'] = f"{self.config['kfold']}fold_{self.config['split']}"

        if self.save_path is None:
            return

        if os.path.exists(self.save_path):
            logger.info('Output directory already exists: {}'.format(self.save_path))
            # Emitted at WARNING level so it is not hidden when INFO is filtered out.
            logger.warning('Warning: Overwrite output directory: {}'.format(self.save_path))
        else:
            logger.info('Create output directory: {}'.format(self.save_path))
            # exist_ok avoids a crash if the directory appears between the
            # existence check above and this call.
            os.makedirs(self.save_path, exist_ok=True)

        out_path = os.path.join(self.save_path, 'config.yaml')
        self.yamlhandler.write_yaml(data=self.config, out_file_path=out_path)
        return