# Install PiNN & download QM9 dataset
!pip install git+https://github.com/Teoroo-CMC/PiNN
!mkdir -p /tmp/dsgdb9nsd && curl -sSL https://ndownloader.figshare.com/files/3195389 | tar xj -C /tmp/dsgdb9nsd
import os, warnings
import tensorflow as tf
import matplotlib.pyplot as plt
from glob import glob
from pinn.io import load_qm9, sparse_batch
from pinn.networks.pinet import PiNet
from pinn.utils import get_atomic_dress
from pinn import get_model, get_network
os.environ['CUDA_VISIBLE_DEVICES'] = ''
index_warning = 'Converting sparse IndexedSlices'
warnings.filterwarnings('ignore', index_warning)
# For the purpose of testing, we use only 1000 samples from QM9
filelist = glob('/tmp/dsgdb9nsd/*.xyz')[:1000]
dataset = lambda: load_qm9(filelist)
ds = dataset().repeat().apply(sparse_batch(100))
tensors = ds.as_numpy_iterator()
for i in range(10):
    next(tensors)  # "Warm up" the graph
%timeit next(tensors)
51.5 ms ± 3.49 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
This speed indicates the I/O limit of our current setup.
Now let's cache the dataset in memory.
ds = dataset().cache().repeat().apply(sparse_batch(100))
tensors = ds.as_numpy_iterator()
for i in range(10):
    next(tensors)  # "Warm up" the graph
%timeit next(tensors)
385 µs ± 35.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Preprocessing
You might also see a notable difference in performance with and without preprocessing. Preprocessing the batches ahead of time is especially helpful when training on GPUs, since the data pipeline can prepare the next batch while the accelerator is busy.
pinet = PiNet()
ds = dataset().cache().repeat().apply(sparse_batch(100))
tensors = ds.as_numpy_iterator()
for i in range(10):
    pinet(next(tensors))  # "Warm up" the graph
%timeit pinet(next(tensors))
89 ms ± 4.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
pinet = PiNet()
ds = dataset().cache().repeat().apply(sparse_batch(100)).map(pinet.preprocess)
tensors = ds.as_numpy_iterator()
for i in range(10):
    next(tensors)  # "Warm up" the graph
%timeit next(tensors)
WARNING:tensorflow:From /home/yunqi/.miniconda/envs/pinn-tf2/lib/python3.9/site-packages/tensorflow/python/ops/array_ops.py:5043: calling gather (from tensorflow.python.ops.array_ops) with validate_indices is deprecated and will be removed in a future version.
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
545 µs ± 38.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
You can even cache the preprocessed data.
pinet = PiNet()
ds = dataset().apply(sparse_batch(100)).map(pinet.preprocess).cache().repeat()
tensors = ds.as_numpy_iterator()
for i in range(10):
    next(tensors)  # "Warm up" the graph
%timeit next(tensors)
289 µs ± 2.58 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
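If the preprocessed dataset is too large to hold in memory, tf.data can also cache it to a file on disk; a minimal sketch of the idea (the cache path is arbitrary, and the prefetch at the end lets the pipeline prepare batches while the model trains):
pinet = PiNet()
ds = (dataset()
      .apply(sparse_batch(100))
      .map(pinet.preprocess)
      .cache('/tmp/qm9_preprocessed.cache')      # spill the cache to disk instead of RAM
      .repeat()
      .prefetch(tf.data.experimental.AUTOTUNE))  # prepare the next batches ahead of time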
Atomic dress
Scaling and aligning the labels can improve the performance of the models and avoid numerical instability. For datasets like QM9, we can assign an atomic energy to each atom according to its element to approximate the total energy. This can be done with a simple linear regression, and we provide a simple tool to generate such "atomic dresses".
filelist = glob('/tmp/dsgdb9nsd/*.xyz')
dataset = lambda: load_qm9(filelist, splits={'train': 8, 'test': 2})
dress, error = get_atomic_dress(dataset()['train'], [1, 6, 7, 8, 9])
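Under the hood, the atomic dress amounts to a least-squares fit of one energy per element against the total energies. A minimal sketch of the idea with NumPy (not the PiNN implementation; the input format here is illustrative):
import numpy as np

def fit_dress_sketch(mols, elems=(1, 6, 7, 8, 9)):
    """mols: iterable of (atomic_numbers, total_energy) pairs (illustrative)."""
    # Count how many atoms of each element every molecule contains
    counts = np.array([[np.sum(np.asarray(z) == e) for e in elems] for z, _ in mols])
    energies = np.array([e for _, e in mols])
    # Least-squares fit: energies ≈ counts @ per-element energies
    coef, *_ = np.linalg.lstsq(counts, energies, rcond=None)
    return dict(zip(elems, coef)), energies - counts @ coef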
Applying the atomic dress converts the QM9 energies to a roughly normal distribution. It also gives us an idea of the relative spread of the energies, and of how much the neural network improves over the naive guess of the atomic dress.
After applying the atomic dress, the spread of the energies in our training set is only about 0.05 Hartree, or roughly 30 kcal/mol.
plt.hist(error,50)
dress
{1: -0.6037799981310462, 6: -38.07402501506576, 7: -54.74923962293649, 8: -75.2255233345936, 9: -99.86678682702703}
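As a quick sanity check, the dressed baseline for a molecule is simply the sum of its atomic energies. For methane (one carbon and four hydrogens) this gives roughly -40.49 Hartree, so the network only needs to learn the comparatively small remainder:
# Baseline energy of CH4 according to the atomic dress: 1 C + 4 H
baseline_ch4 = dress[6] + 4 * dress[1]
print(baseline_ch4)  # ≈ -40.49 Hartree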
Training with the optimized pipeline
!rm -rf /tmp/PiNet_QM9_pipeline
params = {
    'model_dir': '/tmp/PiNet_QM9_pipeline',
    'network': {
        'name': 'PiNet',
        'params': {
            'atom_types': [1, 6, 7, 8, 9],
        },
    },
    'model': {
        'name': 'potential_model',
        'params': {
            'learning_rate': 1e-3,  # Relatively large learning rate
            'e_scale': 627.5,       # Here we scale the model to kcal/mol
            'e_dress': dress
        }
    }
}
# The logging behavior of estimator can be controlled here
config = tf.estimator.RunConfig(log_step_count_steps=500)
# Build the model; the datasets are preprocessed below
model = get_model(params, config=config)

# If you are preprocessing the dataset in the training script,
# the preprocessing layer will occupy the namespace of the network,
# resulting in unexpected names in the checkpoints and errors during prediction.
# To avoid this, wrap your preprocessing function in a name_scope.
# This is not a problem if you save a preprocessed dataset.
def pre_fn(tensors):
    with tf.name_scope("PRE") as scope:
        network = get_network(model.params['network'])
        tensors = network.preprocess(tensors)
    return tensors
train = lambda: dataset()['train'].apply(sparse_batch(100)).map(pre_fn).cache().repeat().shuffle(100)
test = lambda: dataset()['test'].apply(sparse_batch(100))
# Running specs
train_spec = tf.estimator.TrainSpec(input_fn=train, max_steps=1e4)
eval_spec = tf.estimator.EvalSpec(input_fn=test, steps=100)
INFO:tensorflow:Using config: {'_model_dir': '/tmp/PiNet_QM9_pipeline', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true graph_options { rewrite_options { meta_optimizer_iterations: ONE } } , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 500, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
tf.estimator.train_and_evaluate(model, train_spec, eval_spec)
INFO:tensorflow:Not using Distribute Coordinator.
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
12112 trainable vaiabless, training with float32 precision.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/PiNet_QM9_pipeline/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 1608.7036, step = 0
INFO:tensorflow:global_step/sec: 11.2424
INFO:tensorflow:loss = 309.28052, step = 500 (44.477 sec)
INFO:tensorflow:global_step/sec: 11.6739
INFO:tensorflow:loss = 147.40509, step = 1000 (42.830 sec)
INFO:tensorflow:global_step/sec: 25.8236
INFO:tensorflow:loss = 115.164055, step = 1500 (19.362 sec)
INFO:tensorflow:global_step/sec: 26.4694
INFO:tensorflow:loss = 126.90699, step = 2000 (18.894 sec)
INFO:tensorflow:global_step/sec: 26.1443
INFO:tensorflow:loss = 103.33997, step = 2500 (19.120 sec)
INFO:tensorflow:global_step/sec: 26.1268
INFO:tensorflow:loss = 96.97985, step = 3000 (19.137 sec)
INFO:tensorflow:global_step/sec: 25.9872
INFO:tensorflow:loss = 107.959435, step = 3500 (19.241 sec)
INFO:tensorflow:global_step/sec: 26.0982
INFO:tensorflow:loss = 83.18972, step = 4000 (19.158 sec)
INFO:tensorflow:global_step/sec: 26.2075
INFO:tensorflow:loss = 70.3028, step = 4500 (19.080 sec)
INFO:tensorflow:global_step/sec: 25.9199
INFO:tensorflow:loss = 84.25394, step = 5000 (19.289 sec)
INFO:tensorflow:global_step/sec: 26.4121
INFO:tensorflow:loss = 129.86829, step = 5500 (18.930 sec)
INFO:tensorflow:global_step/sec: 25.8288
INFO:tensorflow:loss = 132.20454, step = 6000 (19.359 sec)
INFO:tensorflow:global_step/sec: 26.261
INFO:tensorflow:loss = 69.64721, step = 6500 (19.038 sec)
INFO:tensorflow:global_step/sec: 26.1977
INFO:tensorflow:loss = 62.85822, step = 7000 (19.086 sec)
INFO:tensorflow:global_step/sec: 26.0748
INFO:tensorflow:loss = 69.52461, step = 7500 (19.176 sec)
INFO:tensorflow:global_step/sec: 26.3489
INFO:tensorflow:loss = 93.84022, step = 8000 (18.975 sec)
INFO:tensorflow:global_step/sec: 25.3495
INFO:tensorflow:loss = 97.3127, step = 8500 (19.724 sec)
INFO:tensorflow:global_step/sec: 25.7534
INFO:tensorflow:loss = 43.729958, step = 9000 (19.416 sec)
INFO:tensorflow:global_step/sec: 26.3466
INFO:tensorflow:loss = 41.565964, step = 9500 (18.977 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 10000...
INFO:tensorflow:Saving checkpoints for 10000 into /tmp/PiNet_QM9_pipeline/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 10000...
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2021-05-31T15:01:59
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/PiNet_QM9_pipeline/model.ckpt-10000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [80/100]
INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Inference Time : 10.84179s
INFO:tensorflow:Finished evaluation at 2021-05-31-15:02:10
INFO:tensorflow:Saving dict for global step 10000: METRICS/E_LOSS = 71.01845, METRICS/E_MAE = 5.8880224, METRICS/E_RMSE = 8.427245, global_step = 10000, loss = 71.01845
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10000: /tmp/PiNet_QM9_pipeline/model.ckpt-10000
INFO:tensorflow:Loss for final step: 82.67876.
({'METRICS/E_LOSS': 71.01845, 'METRICS/E_MAE': 5.8880224, 'METRICS/E_RMSE': 8.427245, 'loss': 71.01845, 'global_step': 10000}, [])
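The trained Estimator can also be used for inference. A minimal sketch that pulls one batch of predictions from the test set and prints it (the exact entries returned depend on the model):
# yield_single_examples=False returns whole batches as produced by the model;
# inspect the result to see which quantities the potential model predicts.
pred = next(model.predict(input_fn=test, yield_single_examples=False))
print(pred)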
Monitoring
It is recommended to monitor the training with TensorBoard instead of the stdout logs shown here.
Try tensorboard --logdir /tmp
Parallelization with tf.Estimator
The Estimator API makes it straightforward to train on multiple GPUs.
# Suppose you have two GPUs
distribution = tf.distribute.MirroredStrategy(["GPU:0", "GPU:1"])
config = tf.estimator.RunConfig(train_distribute=distribution)
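The resulting config is passed to get_model just like before, and training proceeds with the same specs. A minimal sketch reusing the params, train_spec, and eval_spec defined above (point model_dir to a fresh directory to start from scratch):
# Same calls as before, now with the distributed RunConfig
model = get_model(params, config=config)
tf.estimator.train_and_evaluate(model, train_spec, eval_spec)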
Conclusions
Congratulations! You can now train atomic neural networks with state-of-the-art accuracy and speed.
But there's more: with PiNN, the components of atomic neural networks are modularized. Read the following notebooks to see how you can build your own ANN.