Commit 6e7d4ba (verified)
mority committed · 1 parent: efff3ad

Upload 53 files

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete changeset.
Files changed (50):
  1. .gitattributes +1 -0
  2. LICENSE +9 -0
  3. configs/sampling/sample_and_maybe_evaluate.yml +25 -0
  4. configs/sampling/sample_train_split.yml +25 -0
  5. configs/training/drugflow.yml +82 -0
  6. configs/training/drugflow_no_virtual_nodes.yml +82 -0
  7. configs/training/drugflow_ood.yml +83 -0
  8. configs/training/flexflow.yml +90 -0
  9. configs/training/preference_alignment.yml +93 -0
  10. docs/drugflow.jpg +3 -0
  11. environment.yaml +30 -0
  12. examples/kras.pdb +0 -0
  13. examples/kras_ref_ligand.sdf +74 -0
  14. scripts/python/evaluate_baselines.py +53 -0
  15. scripts/python/postprocess_metrics.py +271 -0
  16. src/analysis/SA_Score/README.md +1 -0
  17. src/analysis/SA_Score/fpscores.pkl.gz +3 -0
  18. src/analysis/SA_Score/sascorer.py +173 -0
  19. src/analysis/metrics.py +544 -0
  20. src/analysis/visualization_utils.py +192 -0
  21. src/constants.py +256 -0
  22. src/data/data_utils.py +901 -0
  23. src/data/dataset.py +208 -0
  24. src/data/misc.py +19 -0
  25. src/data/molecule_builder.py +107 -0
  26. src/data/nerf.py +250 -0
  27. src/data/normal_modes.py +69 -0
  28. src/data/postprocessing.py +93 -0
  29. src/data/process_crossdocked.py +176 -0
  30. src/data/process_dpo_dataset.py +406 -0
  31. src/data/sanifix.py +159 -0
  32. src/data/so3_utils.py +450 -0
  33. src/default/size_distribution.npy +3 -0
  34. src/generate.py +204 -0
  35. src/model/diffusion_utils.py +206 -0
  36. src/model/dpo.py +252 -0
  37. src/model/dynamics.py +791 -0
  38. src/model/dynamics_hetero.py +1008 -0
  39. src/model/flows.py +448 -0
  40. src/model/gvp.py +650 -0
  41. src/model/gvp_transformer.py +471 -0
  42. src/model/lightning.py +1426 -0
  43. src/model/loss_utils.py +79 -0
  44. src/model/markov_bridge.py +163 -0
  45. src/sample_and_evaluate.py +164 -0
  46. src/sbdd_metrics/evaluation.py +239 -0
  47. src/sbdd_metrics/fpscores.pkl.gz +3 -0
  48. src/sbdd_metrics/interactions.py +231 -0
  49. src/sbdd_metrics/metrics.py +929 -0
  50. src/sbdd_metrics/sascorer.py +173 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ docs/drugflow.jpg filter=lfs diff=lfs merge=lfs -text
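The added rule stores the new figure as a Git LFS pointer rather than a regular blob; lines like this are what `git lfs track "docs/drugflow.jpg"` writes into .gitattributes.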
LICENSE ADDED
@@ -0,0 +1,9 @@
+ MIT License
+
+ Copyright (c) 2025 Arne Schneuing, Ilia Igashov, Adrian Dobbelstein
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
configs/sampling/sample_and_maybe_evaluate.yml ADDED
@@ -0,0 +1,25 @@
+ checkpoint: <TODO>
+ set: test
+ sample_outdir: ./samples
+ n_samples: 100
+ sample_with_ground_truth_size: False
+ device: cuda
+ seed: 42
+ sample: True
+ postprocess: False
+ evaluate: False
+ reduce: reduce
+
+ # Override training config parameters if necessary
+ model_args:
+   virtual_nodes: [0, 5]
+
+ train_params:
+   datadir: ./processed_crossdocked
+   gnina: gnina
+
+ eval_params:
+   n_sampling_steps: 500
+   eval_batch_size: 1
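A minimal sketch of reading this config from Python with PyYAML; how the repo's own entry points (e.g. src/sample_and_evaluate.py from the file list above) consume it is an assumption, not shown in this diff:

    import yaml

    # The 'checkpoint' field still holds the <TODO> placeholder until a real
    # checkpoint path is filled in.
    with open('configs/sampling/sample_and_maybe_evaluate.yml') as f:
        cfg = yaml.safe_load(f)

    print(cfg['n_samples'], cfg['set'], cfg['device'])  # 100 test cuda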
configs/sampling/sample_train_split.yml ADDED
@@ -0,0 +1,25 @@
+ checkpoint: <TODO>
+ set: train
+ sample_outdir: ./samples
+ n_samples: 50
+ sample_with_ground_truth_size: False
+ device: cuda
+ seed: 42
+ sample: True
+ postprocess: False
+ evaluate: False
+ reduce: reduce
+
+ # Override training config parameters if necessary
+ model_args:
+   virtual_nodes: [0, 10]
+
+ train_params:
+   datadir: ./processed_crossdocked
+   gnina: gnina
+   batch_size: 2
+
+ eval_params:
+   n_sampling_steps: 100
configs/training/drugflow.yml ADDED
@@ -0,0 +1,82 @@
+ run_name: drugflow # iclr_drugflow_T5000
+ pocket_representation: CA+
+ virtual_nodes: [0, 10]
+ flexible: False
+ flexible_bb: False
+
+ train_params:
+   logdir: ./runs # symlink to any location you like
+   datadir: ./processed_crossdocked # symlink to the dataset location
+   enable_progress_bar: True
+   num_sanity_val_steps: 0
+   batch_size: 64
+   accumulate_grad_batches: 2
+   lr: 5.0e-4
+   n_epochs: 1000
+   num_workers: 0
+   gpus: 1
+   clip_grad: True
+   gnina: gnina
+   sample_from_clusters: False
+   sharded_dataset: False
+
+ wandb_params:
+   mode: online # disabled, offline, online
+   entity:
+   group: crossdocked
+
+ loss_params:
+   discrete_loss: VLB # VLB or CE
+   lambda_x: 1.0
+   lambda_h: 50.0
+   lambda_e: 50.0
+   lambda_chi: null
+   lambda_trans: null
+   lambda_rot: null
+   lambda_clash: null
+   timestep_weights: null
+
+ simulation_params:
+   n_steps: 5000
+   prior_h: marginal # uniform, marginal
+   prior_e: uniform # uniform, marginal
+   predict_final: False
+   predict_confidence: False
+
+ eval_params:
+   eval_epochs: 100
+   n_eval_samples: 4
+   n_sampling_steps: 500
+   eval_batch_size: 16
+   visualize_sample_epoch: 1
+   n_visualize_samples: 100
+   visualize_chain_epoch: 1
+   keep_frames: 100
+   sample_with_ground_truth_size: True
+
+ predictor_params:
+   heterogeneous_graph: True
+   backbone: gvp
+   num_rbf_time: 16
+   edge_cutoff_ligand: null
+   edge_cutoff_pocket: 10.0
+   edge_cutoff_interaction: 10.0
+   cycle_counts: True
+   spectral_feat: False
+   reflection_equivariant: False
+   num_rbf: 16
+   d_max: 15.0
+   self_conditioning: True
+   augment_residue_sc: False
+   augment_ligand_sc: False
+   normal_modes: False
+   add_chi_as_feature: False
+   angle_act_fn: null
+   add_all_atom_diff: False
+
+ gvp_params:
+   n_layers: 5
+   node_h_dim: [ 128, 32 ] # (s, V)
+   edge_h_dim: [ 128, 32 ]
+   dropout: 0.0
+   vector_gate: True
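Note that with batch_size: 64 and accumulate_grad_batches: 2, gradients are accumulated over two batches per optimizer step, giving an effective batch size of 64 × 2 = 128.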
configs/training/drugflow_no_virtual_nodes.yml ADDED
@@ -0,0 +1,82 @@
+ run_name: drugflow_no_virtual_nodes # iclr_drugflow_T5000_no_virtual_nodes
+ pocket_representation: CA+
+ virtual_nodes: null
+ flexible: False
+ flexible_bb: False
+
+ train_params:
+   logdir: ./runs # symlink to any location you like
+   datadir: ./processed_crossdocked # symlink to the dataset location
+   enable_progress_bar: True
+   num_sanity_val_steps: 0
+   batch_size: 64
+   accumulate_grad_batches: 2
+   lr: 5.0e-4
+   n_epochs: 1000
+   num_workers: 0
+   gpus: 1
+   clip_grad: True
+   gnina: gnina
+   sample_from_clusters: False
+   sharded_dataset: False
+
+ wandb_params:
+   mode: online # disabled, offline, online
+   entity: lpdi
+   group: crossdocked
+
+ loss_params:
+   discrete_loss: VLB # VLB or CE
+   lambda_x: 1.0
+   lambda_h: 50.0
+   lambda_e: 50.0
+   lambda_chi: null
+   lambda_trans: null
+   lambda_rot: null
+   lambda_clash: null
+   timestep_weights: null
+
+ simulation_params:
+   n_steps: 5000
+   prior_h: marginal # uniform, marginal
+   prior_e: uniform # uniform, marginal
+   predict_final: False
+   predict_confidence: False
+
+ eval_params:
+   eval_epochs: 100
+   n_eval_samples: 4
+   n_sampling_steps: 500
+   eval_batch_size: 16
+   visualize_sample_epoch: 1
+   n_visualize_samples: 100
+   visualize_chain_epoch: 1
+   keep_frames: 100
+   sample_with_ground_truth_size: True
+
+ predictor_params:
+   heterogeneous_graph: True
+   backbone: gvp
+   num_rbf_time: 16
+   edge_cutoff_ligand: null
+   edge_cutoff_pocket: 10.0
+   edge_cutoff_interaction: 10.0
+   cycle_counts: True
+   spectral_feat: False
+   reflection_equivariant: False
+   num_rbf: 16
+   d_max: 15.0
+   self_conditioning: True
+   augment_residue_sc: False
+   augment_ligand_sc: False
+   normal_modes: False
+   add_chi_as_feature: False
+   angle_act_fn: null
+   add_all_atom_diff: False
+
+ gvp_params:
+   n_layers: 5
+   node_h_dim: [ 128, 32 ] # (s, V)
+   edge_h_dim: [ 128, 32 ]
+   dropout: 0.0
+   vector_gate: True
configs/training/drugflow_ood.yml ADDED
@@ -0,0 +1,83 @@
+ run_name: drugflow_ood # iclr_drugflow_T5000_confidence_ru10
+ pocket_representation: CA+
+ virtual_nodes: [0, 10]
+ flexible: False
+ flexible_bb: False
+
+ train_params:
+   logdir: ./runs # symlink to any location you like
+   datadir: ./processed_crossdocked # symlink to the dataset location
+   enable_progress_bar: True
+   num_sanity_val_steps: 0
+   batch_size: 64
+   accumulate_grad_batches: 2
+   lr: 5.0e-4
+   n_epochs: 1000
+   num_workers: 0
+   gpus: 1
+   clip_grad: True
+   gnina: gnina
+   sample_from_clusters: False
+   sharded_dataset: False
+
+ wandb_params:
+   mode: online # disabled, offline, online
+   entity: lpdi
+   group: crossdocked
+
+ loss_params:
+   discrete_loss: VLB # VLB or CE
+   lambda_x: 1.0
+   lambda_h: 50.0
+   lambda_e: 50.0
+   lambda_chi: null
+   lambda_trans: null
+   lambda_rot: null
+   lambda_clash: null
+   timestep_weights: null
+   regularize_uncertainty: 10.0
+
+ simulation_params:
+   n_steps: 5000
+   prior_h: marginal # uniform, marginal
+   prior_e: uniform # uniform, marginal
+   predict_final: False
+   predict_confidence: True
+
+ eval_params:
+   eval_epochs: 100
+   n_eval_samples: 4
+   n_sampling_steps: 500
+   eval_batch_size: 16
+   visualize_sample_epoch: 1
+   n_visualize_samples: 100
+   visualize_chain_epoch: 1
+   keep_frames: 100
+   sample_with_ground_truth_size: True
+
+ predictor_params:
+   heterogeneous_graph: True
+   backbone: gvp
+   num_rbf_time: 16
+   edge_cutoff_ligand: null
+   edge_cutoff_pocket: 10.0
+   edge_cutoff_interaction: 10.0
+   cycle_counts: True
+   spectral_feat: False
+   reflection_equivariant: False
+   num_rbf: 16
+   d_max: 15.0
+   self_conditioning: True
+   augment_residue_sc: False
+   augment_ligand_sc: False
+   normal_modes: False
+   add_chi_as_feature: False
+   angle_act_fn: null
+   add_all_atom_diff: False
+
+ gvp_params:
+   n_layers: 5
+   node_h_dim: [ 128, 32 ] # (s, V)
+   edge_h_dim: [ 128, 32 ]
+   dropout: 0.0
+   vector_gate: True
configs/training/flexflow.yml ADDED
@@ -0,0 +1,90 @@
+ run_name: flexflow
+ pocket_representation: CA+
+ virtual_nodes: [0, 10]
+ flexible: True
+ flexible_bb: False
+
+ train_params:
+   logdir: ./runs # symlink to any location you like
+   datadir: ./processed_crossdocked # symlink to the dataset location
+   enable_progress_bar: False
+   num_sanity_val_steps: 0
+   batch_size: 64
+   accumulate_grad_batches: 2
+   lr: 5.0e-4
+   lr_step_size: null
+   lr_gamma: null
+   n_epochs: 700
+   num_workers: 4
+   gpus: 1
+   clip_grad: True
+   gnina: gnina # add gnina location to PATH
+   sample_from_clusters: False
+   sharded_dataset: False
+
+ wandb_params:
+   mode: online # disabled, offline, online
+   entity:
+   group: crossdocked
+
+ loss_params:
+   discrete_loss: VLB # VLB or CE
+   reduce: sum # 'mean' or 'sum'
+   lambda_x: 0.015
+   lambda_h: 2.5
+   lambda_e: 0.25
+   lambda_chi: 0.002
+   lambda_trans: null
+   lambda_rot: null
+   lambda_clash: null
+   regularize_uncertainty: null
+   timestep_weights: null
+
+ simulation_params:
+   n_steps: 5000
+   prior_h: marginal # uniform, marginal
+   prior_e: uniform # uniform, marginal
+   predict_final: False
+   predict_confidence: False
+   scheduler_chi:
+     type: polynomial
+     k: 3 # exponent of the polynomial schedule kappa(t) = (1-t)^k
+
+ eval_params:
+   eval_epochs: 100
+   n_loss_per_sample: 100
+   n_eval_samples: 4
+   n_sampling_steps: 500
+   eval_batch_size: 16
+   visualize_sample_epoch: 1
+   n_visualize_samples: 100
+   visualize_chain_epoch: 1
+   keep_frames: 100
+   sample_with_ground_truth_size: True
+
+ predictor_params:
+   heterogeneous_graph: True
+   backbone: gvp
+   num_rbf_time: 16
+   edge_cutoff_ligand: null
+   edge_cutoff_pocket: 10.0
+   edge_cutoff_interaction: 10.0
+   cycle_counts: True
+   spectral_feat: False
+   reflection_equivariant: False
+   num_rbf: 16
+   d_max: 15.0
+   self_conditioning: True
+   augment_residue_sc: False
+   augment_ligand_sc: False
+   normal_modes: False
+   add_chi_as_feature: False
+   angle_act_fn: null
+   add_all_atom_diff: True
+
+ gvp_params:
+   n_layers: 5
+   node_h_dim: [ 128, 32 ] # (s, V)
+   edge_h_dim: [ 128, 32 ]
+   dropout: 0.0
+   vector_gate: True
configs/training/preference_alignment.yml ADDED
@@ -0,0 +1,93 @@
+ run_name: drugflow_preference_alignment
+
+ checkpoint: ./reference.ckpt # TODO: specify reference checkpoint
+ dpo_mode: single_dpo_comp_v3
+
+ pocket_representation: CA+
+ virtual_nodes: [0, 10]
+ flexible: False
+ flexible_bb: False
+
+ train_params:
+   logdir: ./runs # symlink to any location you like
+   datadir: ./processed_crossdocked # symlink to the dataset location
+   enable_progress_bar: True
+   num_sanity_val_steps: 0
+   batch_size: 64
+   accumulate_grad_batches: 2
+   lr: 5.0e-5
+   n_epochs: 500
+   num_workers: 0
+   gpus: 1
+   clip_grad: True
+   gnina: gnina # path to the gnina binary
+   sample_from_clusters: False
+   sharded_dataset: False
+
+ wandb_params:
+   mode: online # disabled, offline, online
+   entity:
+   group: crossdocked
+
+ loss_params:
+   discrete_loss: VLB # VLB or CE
+   lambda_x: 1.0
+   lambda_h: 500
+   dpo_lambda_h: 2500
+   lambda_e: 500
+   dpo_lambda_e: 2500
+   lambda_chi: 0.5 # only effective if flexible=True
+   lambda_trans: 1.0 # only effective if flexible_bb=True
+   lambda_rot: 0.1 # only effective if flexible_bb=True
+   lambda_clash: null
+   timestep_weights: null # sigmoid_a=1_b=10 # null, sigmoid_a=?_b=?
+   dpo_beta: 100.0
+   dpo_beta_schedule: 't'
+   dpo_lambda_w: 1.0
+   dpo_lambda_l: 0.2
+   clamp_dpo: False
+
+ simulation_params:
+   n_steps: 5000
+   prior_h: marginal # uniform, marginal
+   prior_e: uniform # uniform, marginal
+   predict_final: False
+   predict_confidence: False
+
+ eval_params:
+   eval_epochs: 4
+   n_eval_samples: 1
+   n_sampling_steps: 500
+   eval_batch_size: 16
+   visualize_sample_epoch: 1
+   n_visualize_samples: 10
+   visualize_chain_epoch: 1
+   keep_frames: 100
+   sample_with_ground_truth_size: True
+
+ predictor_params:
+   heterogeneous_graph: True
+   backbone: gvp
+   num_rbf_time: 16
+   edge_cutoff_ligand: null
+   edge_cutoff_pocket: 10.0
+   edge_cutoff_interaction: 10.0
+   cycle_counts: True
+   spectral_feat: False
+   reflection_equivariant: False
+   num_rbf: 16
+   d_max: 15.0
+   self_conditioning: True
+   augment_residue_sc: False
+   augment_ligand_sc: False
+   normal_modes: False
+   add_chi_as_feature: False
+   angle_act_fn: null
+   add_all_atom_diff: False
+
+ gvp_params:
+   n_layers: 5
+   node_h_dim: [ 128, 32 ] # (s, V)
+   edge_h_dim: [ 128, 32 ]
+   dropout: 0.0
+   vector_gate: True
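Relative to the base drugflow config, this setup lowers the learning rate to 5.0e-5, requires a pretrained reference checkpoint, and adds the dpo_* weights, consistent with DPO-style fine-tuning against a frozen reference model.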
docs/drugflow.jpg ADDED

Git LFS Details

  • SHA256: c16f816eafa8e13658526b74f06f8ee5fb3258f51172c711ec1d1d539b48a4ef
  • Pointer size: 131 Bytes
  • Size of remote file: 762 kB
environment.yaml ADDED
@@ -0,0 +1,30 @@
+ name: sbdd
+
+ channels:
+   - pytorch
+   - conda-forge
+   - anaconda
+   - pyg
+   - nvidia
+
+ dependencies:
+   - python=3.11.8
+   - pytorch=2.2.1=*cuda12.1*
+   - pytorch-cuda=12.1
+   - pytorch-lightning=2.2.1
+   - rdkit=2023.09.6
+   - openbabel=3.1.1
+   - biopython=1.83
+   - scipy=1.12.0
+   - pyg=2.5.1
+   - pytorch-scatter=2.1.2
+   - ProDy=2.4.0
+   - wandb=0.16.3
+   - pandas=2.2.2
+   - pip=24.0
+   - pip:
+     - posebusters==0.3.1
+     - useful_rdkit_utils==0.65
+     - fcd==1.2.2
+     - webdataset==0.2.86
+     - prolif==2.0.3
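The environment can be created with `conda env create -f environment.yaml` and activated with `conda activate sbdd` (the name field above).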
examples/kras.pdb ADDED
The diff for this file is too large to render. See raw diff
 
examples/kras_ref_ligand.sdf ADDED
@@ -0,0 +1,74 @@
+ 8AZR
+ PyMOL2.5 3D 0
+
+ 32 36 0 0 0 0 0 0 0 0999 V2000
+ 15.7084 1.6569 4.9428 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 16.2939 1.9182 6.3219 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 17.7757 1.5677 6.3468 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 18.0388 0.0580 6.1328 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 16.1458 0.3026 4.4709 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 17.1748 -0.4207 4.9854 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 17.2894 -1.6945 4.3617 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 16.3332 -1.9132 3.3763 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 15.2948 -0.5437 3.2188 S 0 0 0 0 0 0 0 0 0 0 0 0
+ 17.6856 -0.7371 7.4005 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 19.5008 -0.1084 5.7694 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 19.9420 0.4778 4.6523 O 0 0 0 0 0 0 0 0 0 0 0 0
+ 21.3366 0.1893 4.6052 N 0 0 0 0 0 0 0 0 0 0 0 0
+ 21.5306 -0.5212 5.6843 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 20.3929 -0.7319 6.4483 N 0 0 0 0 0 0 0 0 0 0 0 0
+ 16.1651 -3.0052 2.6033 N 0 0 0 0 0 0 0 0 0 0 0 0
+ 22.8349 -1.0932 6.0768 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 23.9207 -0.6312 5.4365 N 0 0 0 0 0 0 0 0 0 0 0 0
+ 25.1129 -1.1528 5.7755 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 25.2639 -2.1387 6.7500 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 24.1280 -2.5941 7.3940 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 22.8945 -2.0709 7.0591 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 18.2816 -2.6789 4.6625 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 19.0589 -3.4973 4.8688 N 0 0 0 0 0 0 0 0 0 0 0 0
+ 26.1982 -0.6750 5.0820 N 0 0 0 0 0 0 0 0 0 0 0 0
+ 26.0358 0.4071 4.0954 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 26.8978 0.1468 2.8491 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 28.2989 -0.1678 3.2648 N 0 0 0 0 0 0 0 0 0 0 0 0
+ 28.3171 -1.4142 4.0851 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 27.5312 -1.2091 5.3777 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 29.1988 -0.2741 2.0804 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 26.3415 1.7618 4.7132 C 0 0 0 0 0 0 0 0 0 0 0 0
+ 1 2 1 0 0 0 0
+ 2 3 1 0 0 0 0
+ 3 4 1 0 0 0 0
+ 4 6 1 0 0 0 0
+ 4 10 1 0 0 0 0
+ 4 11 1 0 0 0 0
+ 1 5 1 0 0 0 0
+ 5 6 4 0 0 0 0
+ 5 9 4 0 0 0 0
+ 6 7 4 0 0 0 0
+ 7 8 4 0 0 0 0
+ 7 23 1 0 0 0 0
+ 8 9 4 0 0 0 0
+ 8 16 1 0 0 0 0
+ 11 12 4 0 0 0 0
+ 11 15 4 0 0 0 0
+ 12 13 4 0 0 0 0
+ 13 14 4 0 0 0 0
+ 14 15 4 0 0 0 0
+ 14 17 1 0 0 0 0
+ 17 18 4 0 0 0 0
+ 17 22 4 0 0 0 0
+ 18 19 4 0 0 0 0
+ 19 25 1 0 0 0 0
+ 19 20 4 0 0 0 0
+ 20 21 4 0 0 0 0
+ 21 22 4 0 0 0 0
+ 23 24 3 0 0 0 0
+ 25 26 1 0 0 0 0
+ 26 27 1 0 0 0 0
+ 26 32 1 0 0 0 0
+ 27 28 1 0 0 0 0
+ 28 29 1 0 0 0 0
+ 29 30 1 0 0 0 0
+ 25 30 1 0 0 0 0
+ 28 31 1 0 0 0 0
+ M END
+ $$$$
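A minimal sketch of loading this reference ligand with RDKit; sanitize=False mirrors how other scripts in this commit read SDF files:

    from rdkit import Chem

    # Read the first (and only) record of the reference ligand SDF.
    mol = Chem.SDMolSupplier('examples/kras_ref_ligand.sdf', sanitize=False)[0]
    print(mol.GetNumAtoms())  # 32 heavy atoms, matching the V2000 counts line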
scripts/python/evaluate_baselines.py ADDED
@@ -0,0 +1,53 @@
+ import argparse
+ import pickle
+ import sys
+ from pathlib import Path
+
+ basedir = Path(__file__).resolve().parent.parent.parent
+ sys.path.append(str(basedir))
+
+ from src.sbdd_metrics.evaluation import compute_all_metrics_drugflow
+
+ if __name__ == '__main__':
+     p = argparse.ArgumentParser()
+     p.add_argument('--in_dir', type=Path, required=True, help='Directory with samples')
+     p.add_argument('--out_dir', type=str, required=True, help='Output directory')
+     p.add_argument('--reference_smiles', type=str, default=None, help='Path to the .npy file with reference SMILES (optional)')
+     p.add_argument('--gnina', type=str, default=None, help='Path to the gnina binary (optional)')
+     p.add_argument('--reduce', type=str, default=None, help='Path to the reduce binary (optional)')
+     p.add_argument('--n_samples', type=int, default=None, help='Top-N samples to evaluate (optional)')
+     p.add_argument('--exclude', type=str, nargs='+', default=[], help='Evaluator IDs to exclude')
+     p.add_argument('--job_id', type=int, default=0, help='Job ID')
+     p.add_argument('--n_jobs', type=int, default=1, help='Number of jobs')
+     args = p.parse_args()
+
+     Path(args.out_dir).mkdir(exist_ok=True, parents=True)
+     if args.job_id == 0 and args.n_jobs == 1:
+         out_detailed_table = Path(args.out_dir, 'metrics_detailed.csv')
+         out_aggregated_table = Path(args.out_dir, 'metrics_aggregated.csv')
+         out_distributions_file = Path(args.out_dir, 'metrics_data.pkl')
+     else:
+         out_detailed_table = Path(args.out_dir, f'metrics_detailed_{args.job_id}.csv')
+         out_aggregated_table = Path(args.out_dir, f'metrics_aggregated_{args.job_id}.csv')
+         out_distributions_file = Path(args.out_dir, f'metrics_data_{args.job_id}.pkl')
+
+     if out_detailed_table.exists() and out_aggregated_table.exists() and out_distributions_file.exists():
+         print('Data already exists. Terminating.')
+         sys.exit(0)
+
+     print(f'Evaluating: {args.in_dir}')
+     data, detailed, aggregated = compute_all_metrics_drugflow(
+         in_dir=args.in_dir,
+         gnina_path=args.gnina,
+         reduce_path=args.reduce,
+         reference_smiles_path=args.reference_smiles,
+         n_samples=args.n_samples,
+         exclude_evaluators=args.exclude,
+         job_id=args.job_id,
+         n_jobs=args.n_jobs,
+     )
+
+     detailed.to_csv(out_detailed_table, index=False)
+     aggregated.to_csv(out_aggregated_table, index=False)
+     with open(out_distributions_file, 'wb') as f:
+         pickle.dump(data, f)
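For reference, a minimal sketch of invoking the same evaluation directly from Python instead of through the CLI above; the keyword arguments mirror the compute_all_metrics_drugflow call in the script, and 'samples/' is a placeholder path:

    from src.sbdd_metrics.evaluation import compute_all_metrics_drugflow

    # gnina/reduce binaries and reference SMILES are optional and left unset here.
    data, detailed, aggregated = compute_all_metrics_drugflow(
        in_dir='samples/',
        gnina_path=None,
        reduce_path=None,
        reference_smiles_path=None,
        n_samples=None,
        exclude_evaluators=[],
        job_id=0,
        n_jobs=1,
    )
    detailed.to_csv('metrics_detailed.csv', index=False)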
scripts/python/postprocess_metrics.py ADDED
@@ -0,0 +1,271 @@
+ import argparse
+ import os
+ import pickle
+ import sys
+ from collections import Counter, defaultdict
+ from pathlib import Path
+
+ import numpy as np
+ import pandas as pd
+ from rdkit import Chem
+ from scipy.stats import wasserstein_distance
+ from scipy.spatial.distance import jensenshannon
+ from tqdm import tqdm
+
+ basedir = Path(__file__).resolve().parent.parent.parent
+ sys.path.append(str(basedir))
+
+ from src.data.data_utils import atom_encoder, bond_encoder, encode_atom
+ from src.sbdd_metrics.evaluation import VALIDITY_METRIC_NAME, aggregated_metrics, collection_metrics, get_data_type
+ from src.sbdd_metrics.metrics import FullEvaluator
+
+
+ DATA_TYPES = FullEvaluator().dtypes
+
+ MEDCHEM_PROPS = [
+     'medchem.qed',
+     'medchem.sa',
+     'medchem.logp',
+     'medchem.lipinski',
+     'medchem.size',
+     'medchem.n_rotatable_bonds',
+     'energy.energy',
+ ]
+
+ DOCKING_PROPS = [
+     'gnina.vina_score',
+     'gnina.gnina_score',
+     'gnina.vina_efficiency',
+     'gnina.gnina_efficiency',
+ ]
+
+ RELEVANT_INTERACTIONS = [
+     'interactions.HBAcceptor',
+     'interactions.HBDonor',
+     'interactions.HB',
+     'interactions.PiStacking',
+     'interactions.Hydrophobic',
+     #
+     'interactions.HBAcceptor.normalized',
+     'interactions.HBDonor.normalized',
+     'interactions.HB.normalized',
+     'interactions.PiStacking.normalized',
+     'interactions.Hydrophobic.normalized',
+ ]
+
+
+ def compute_discrete_distributions(smiles, name):
+     atom_counter = Counter()
+     bond_counter = Counter()
+
+     for smi in tqdm(smiles, desc=name):
+         mol = Chem.MolFromSmiles(smi)
+         mol = Chem.RemoveAllHs(mol, sanitize=False)
+         for atom in mol.GetAtoms():
+             try:
+                 encoded_atom = encode_atom(atom, atom_encoder=atom_encoder)
+             except KeyError:
+                 continue
+             atom_counter[encoded_atom] += 1
+         for bond in mol.GetBonds():
+             bond_counter[bond_encoder[str(bond.GetBondType())]] += 1
+
+     atom_distribution = np.zeros(len(atom_encoder))
+     bond_distribution = np.zeros(len(bond_encoder))
+
+     for k, v in atom_counter.items():
+         atom_distribution[k] = v
+     for k, v in bond_counter.items():
+         bond_distribution[k] = v
+
+     atom_distribution = atom_distribution / atom_distribution.sum()
+     bond_distribution = bond_distribution / bond_distribution.sum()
+
+     return atom_distribution, bond_distribution
+
+
+ def flatten_distribution(data, name, table):
+     aux = ['sample', 'sdf_file', 'pdb_file']
+     method_distributions = defaultdict(list)
+
+     sdf2sample2size = defaultdict(dict)
+     for _, row in table.iterrows():
+         sdf2sample2size[row['sdf_file']][int(row['sample'])] = row['medchem.size']
+
+     for item in tqdm(data, desc=name):
+         if item['medchem.valid'] is not True:
+             continue
+
+         if 'interactions.HBAcceptor' in item and 'interactions.HBDonor' in item:
+             item['interactions.HB'] = item['interactions.HBAcceptor'] + item['interactions.HBDonor']
+
+         new_entries = {}
+         for key, value in item.items():
+             if key.startswith('interactions'):
+                 size = sdf2sample2size.get(item['sdf_file'], dict()).get(int(item['sample']))
+                 if size is not None:
+                     new_entries[key + '.normalized'] = value / size
+         item.update(new_entries)
+
+         for key, value in item.items():
+             if value is None:
+                 continue
+             if key in aux:
+                 continue
+             if key == 'energy.energy' and abs(value) > 1000:
+                 continue
+
+             if get_data_type(key, DATA_TYPES, default=type(value)) == list:
+                 method_distributions[key] += value
+             else:
+                 method_distributions[key].append(value)
+
+     return method_distributions
+
+
+ def prepare_baseline_data(root_path, baseline_name):
+     metrics_detailed = pd.read_csv(f'{root_path}/metrics_detailed.csv')
+     metrics_detailed = metrics_detailed[metrics_detailed['medchem.valid']]
+     distributions = pickle.load(open(f'{root_path}/metrics_data.pkl', 'rb'))
+     distributions = flatten_distribution(distributions, name=baseline_name, table=metrics_detailed)
+     distributions['energy.energy'] = [v for v in distributions['energy.energy'] if -1000 <= v <= 1000]
+     for prop in MEDCHEM_PROPS + DOCKING_PROPS:
+         distributions[prop] = metrics_detailed[prop].dropna().values.tolist()
+
+     smiles = metrics_detailed['representation.smiles']
+     atom_distribution, bond_distribution = compute_discrete_distributions(smiles, name=baseline_name)
+     discrete_distributions = {
+         'atom_types': atom_distribution,
+         'bond_types': bond_distribution,
+     }
+
+     return distributions, discrete_distributions
+
+
+ if __name__ == '__main__':
+     p = argparse.ArgumentParser()
+     p.add_argument('--in_dir', type=Path, required=True, help='Directory with samples')
+     p.add_argument('--out_dir', type=str, required=True, help='Output directory')
+     p.add_argument('--n_samples', type=int, required=False, default=None, help='N samples per target')
+     p.add_argument('--reference_smiles', type=str, default=None, help='Path to the .npy file with reference SMILES (optional)')
+     p.add_argument('--crossdocked_dir', type=str, required=False, default=None, help='CrossDocked data dir for computing distances between distributions')
+     args = p.parse_args()
+
+     Path(args.out_dir).mkdir(parents=True, exist_ok=True)
+
+     print('Combining data')
+     data = []
+     for file_path in tqdm(Path(args.in_dir).glob('metrics_data_*.pkl')):
+         with open(file_path, 'rb') as f:
+             d = pickle.load(f)
+         if args.n_samples is not None:
+             d = d[:args.n_samples]
+         data += d
+     with open(Path(args.out_dir, 'metrics_data.pkl'), 'wb') as f:
+         pickle.dump(data, f)
+
+     print('Combining detailed metrics')
+     tables = []
+     for file_path in tqdm(Path(args.in_dir).glob('metrics_detailed_*.csv')):
+         table = pd.read_csv(file_path)
+         if args.n_samples is not None:
+             table = table.head(args.n_samples)
+         tables.append(table)
+
+     table_detailed = pd.concat(tables)
+     table_detailed.to_csv(Path(args.out_dir, 'metrics_detailed.csv'), index=False)
+
+     print('Computing aggregated metrics')
+     evaluator = FullEvaluator(gnina='gnina', reduce='reduce')
+     table_aggregated = aggregated_metrics(
+         table_detailed,
+         data_types=evaluator.dtypes,
+         validity_metric_name=VALIDITY_METRIC_NAME
+     )
+
+     if args.reference_smiles is not None:
+         reference_smiles = np.load(args.reference_smiles)
+         col_metrics = collection_metrics(
+             table=table_detailed,
+             reference_smiles=reference_smiles,
+             validity_metric_name=VALIDITY_METRIC_NAME,
+             exclude_evaluators=[],
+         )
+         table_aggregated = pd.concat([table_aggregated, col_metrics])
+
+     table_aggregated.to_csv(Path(args.out_dir, 'metrics_aggregated.csv'), index=False)
+
+     # Computing distribution distances against the training data
+     if args.crossdocked_dir is not None:
+
+         # Loading training data distributions
+         crossdocked_distributions = None
+         crossdocked_discrete_distributions = None
+         precomputed_distr_path = f'{args.crossdocked_dir}/crossdocked_distributions.pkl'
+         precomputed_discrete_distr_path = f'{args.crossdocked_dir}/crossdocked_discrete_distributions.pkl'
+         if os.path.exists(precomputed_distr_path) and os.path.exists(precomputed_discrete_distr_path):
+             # Use precomputed distributions in case they exist
+             with open(precomputed_distr_path, 'rb') as f:
+                 crossdocked_distributions = pickle.load(f)
+             with open(precomputed_discrete_distr_path, 'rb') as f:
+                 crossdocked_discrete_distributions = pickle.load(f)
+         else:
+             assert os.path.exists(f'{args.crossdocked_dir}/metrics_detailed.csv')
+             assert os.path.exists(f'{args.crossdocked_dir}/metrics_data.pkl')
+             crossdocked_distributions, crossdocked_discrete_distributions = prepare_baseline_data(
+                 root_path=args.crossdocked_dir,
+                 baseline_name='crossdocked'
+             )
+             # Save precomputed distributions for faster subsequent runs
+             with open(precomputed_distr_path, 'wb') as f:
+                 pickle.dump(crossdocked_distributions, f)
+             with open(precomputed_discrete_distr_path, 'wb') as f:
+                 pickle.dump(crossdocked_discrete_distributions, f)
+
+         # Selecting the top-5 most frequent bond geometries (two-element keys)
+         # and angle geometries (three-element keys)
+         bonds = sorted([
+             (k, len(v)) for k, v in crossdocked_distributions.items()
+             if k.startswith('geometry.') and sum(s.isalpha() for s in k.split('.')[1]) == 2
+         ], key=lambda t: t[1], reverse=True)[:5]
+         top_5_bonds = [t[0] for t in bonds]
+
+         angles = sorted([
+             (k, len(v)) for k, v in crossdocked_distributions.items()
+             if k.startswith('geometry.') and sum(s.isalpha() for s in k.split('.')[1]) == 3
+         ], key=lambda t: t[1], reverse=True)[:5]
+         top_5_angles = [t[0] for t in angles]
+
+         # Loading distributions of samples
+         distributions, discrete_distributions = prepare_baseline_data(args.out_dir, 'samples')
+
+         # Computing distances between distributions
+         distances = {'method': 'method'}
+         relevant_columns = MEDCHEM_PROPS + DOCKING_PROPS + RELEVANT_INTERACTIONS + top_5_bonds + top_5_angles
+         for metric in distributions.keys():
+             if metric not in relevant_columns:
+                 continue
+
+             ref = crossdocked_distributions.get(metric)
+             cur = [x for x in distributions.get(metric) if not pd.isna(x)]
+
+             if ref is not None and len(cur) > 0:
+                 try:
+                     distance = wasserstein_distance(ref, cur)
+                 except ValueError as e:
+                     print(f'Could not compute Wasserstein distance for {metric}: {e}')
+                     continue
+                 distances[f'WD.{metric}'] = distance
+
+         for metric in crossdocked_discrete_distributions.keys():
+             ref = crossdocked_discrete_distributions.get(metric)
+             cur = discrete_distributions.get(metric)
+             if ref is not None and cur is not None:
+                 distances[f'JS.{metric}'] = jensenshannon(p=ref, q=cur)
+
+         dist_table = pd.DataFrame([distances])
+         dist_table.to_csv(Path(args.out_dir, 'metrics_distances.csv'), index=False)
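As a toy illustration of the two distances computed above (Wasserstein for continuous property distributions, Jensen-Shannon for the discrete atom/bond-type histograms); the arrays are made-up data, not repo outputs:

    import numpy as np
    from scipy.stats import wasserstein_distance
    from scipy.spatial.distance import jensenshannon

    # Continuous distributions are compared sample-to-sample.
    wd = wasserstein_distance([1.0, 2.0, 3.0], [1.5, 2.5, 3.5])  # 0.5

    # Discrete distributions are compared as normalized histograms.
    js = jensenshannon(p=np.array([0.7, 0.3]), q=np.array([0.6, 0.4]))
    print(wd, js)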
src/analysis/SA_Score/README.md ADDED
@@ -0,0 +1 @@
+ Files taken from: https://github.com/rdkit/rdkit/tree/master/Contrib/SA_Score
src/analysis/SA_Score/fpscores.pkl.gz ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:10dcef9340c873e7b987924461b0af5365eb8dd96be607203debe8ddf80c1e73
+ size 3848394
src/analysis/SA_Score/sascorer.py ADDED
@@ -0,0 +1,173 @@
+ #
+ # calculation of synthetic accessibility score as described in:
+ #
+ # Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
+ # Peter Ertl and Ansgar Schuffenhauer
+ # Journal of Cheminformatics 1:8 (2009)
+ # http://www.jcheminf.com/content/1/1/8
+ #
+ # several small modifications to the original paper are included,
+ # particularly a slightly different formula for the macrocyclic penalty
+ # and taking into account also molecule symmetry (fingerprint density)
+ #
+ # for a set of 10k diverse molecules the agreement between the original method
+ # as implemented in PipelinePilot and this implementation is r2 = 0.97
+ #
+ # peter ertl & greg landrum, september 2013
+ #
+
+
+ from rdkit import Chem
+ from rdkit.Chem import rdMolDescriptors
+ import pickle
+
+ import math
+ from collections import defaultdict
+
+ import os.path as op
+
+ _fscores = None
+
+
+ def readFragmentScores(name='fpscores'):
+     import gzip
+     global _fscores
+     # generate the full path filename:
+     if name == "fpscores":
+         name = op.join(op.dirname(__file__), name)
+     data = pickle.load(gzip.open('%s.pkl.gz' % name))
+     outDict = {}
+     for i in data:
+         for j in range(1, len(i)):
+             outDict[i[j]] = float(i[0])
+     _fscores = outDict
+
+
+ def numBridgeheadsAndSpiro(mol, ri=None):
+     nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
+     nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
+     return nBridgehead, nSpiro
+
+
+ def calculateScore(m):
+     if _fscores is None:
+         readFragmentScores()
+
+     # fragment score
+     fp = rdMolDescriptors.GetMorganFingerprint(m, 2)  # <- 2 is the *radius* of the circular fingerprint
+     fps = fp.GetNonzeroElements()
+     score1 = 0.
+     nf = 0
+     for bitId, v in fps.items():
+         nf += v
+         sfp = bitId
+         score1 += _fscores.get(sfp, -4) * v
+     score1 /= nf
+
+     # features score
+     nAtoms = m.GetNumAtoms()
+     nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
+     ri = m.GetRingInfo()
+     nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
+     nMacrocycles = 0
+     for x in ri.AtomRings():
+         if len(x) > 8:
+             nMacrocycles += 1
+
+     sizePenalty = nAtoms**1.005 - nAtoms
+     stereoPenalty = math.log10(nChiralCenters + 1)
+     spiroPenalty = math.log10(nSpiro + 1)
+     bridgePenalty = math.log10(nBridgeheads + 1)
+     macrocyclePenalty = 0.
+     # ---------------------------------------
+     # This differs from the paper, which defines:
+     # macrocyclePenalty = math.log10(nMacrocycles+1)
+     # This form generates better results when 2 or more macrocycles are present
+     if nMacrocycles > 0:
+         macrocyclePenalty = math.log10(2)
+
+     score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty
+
+     # correction for the fingerprint density
+     # not in the original publication, added in version 1.1
+     # to make highly symmetrical molecules easier to synthesize
+     score3 = 0.
+     if nAtoms > len(fps):
+         score3 = math.log(float(nAtoms) / len(fps)) * .5
+
+     sascore = score1 + score2 + score3
+
+     # need to transform "raw" value into scale between 1 and 10
+     min = -4.0
+     max = 2.5
+     sascore = 11. - (sascore - min + 1) / (max - min) * 9.
+     # smooth the 10-end
+     if sascore > 8.:
+         sascore = 8. + math.log(sascore + 1. - 9.)
+     if sascore > 10.:
+         sascore = 10.0
+     elif sascore < 1.:
+         sascore = 1.0
+
+     return sascore
+
+
+ def processMols(mols):
+     print('smiles\tName\tsa_score')
+     for i, m in enumerate(mols):
+         if m is None:
+             continue
+
+         s = calculateScore(m)
+
+         smiles = Chem.MolToSmiles(m)
+         print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s)
+
+
+ if __name__ == '__main__':
+     import sys
+     import time
+
+     t1 = time.time()
+     readFragmentScores("fpscores")
+     t2 = time.time()
+
+     suppl = Chem.SmilesMolSupplier(sys.argv[1])
+     t3 = time.time()
+     processMols(suppl)
+     t4 = time.time()
+
+     print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
+           file=sys.stderr)
+
+ #
+ # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
+ # All rights reserved.
+ #
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are
+ # met:
+ #
+ #     * Redistributions of source code must retain the above copyright
+ #       notice, this list of conditions and the following disclaimer.
+ #     * Redistributions in binary form must reproduce the above
+ #       copyright notice, this list of conditions and the following
+ #       disclaimer in the documentation and/or other materials provided
+ #       with the distribution.
+ #     * Neither the name of Novartis Institutes for BioMedical Research Inc.
+ #       nor the names of its contributors may be used to endorse or promote
+ #       products derived from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
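A short usage sketch for the scorer above (requires RDKit and the bundled fpscores.pkl.gz; the aspirin SMILES is just an example molecule):

    from rdkit import Chem
    from src.analysis.SA_Score.sascorer import calculateScore

    mol = Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O')  # aspirin
    print(calculateScore(mol))  # score in [1, 10]; lower means easier to synthesize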
src/analysis/metrics.py ADDED
@@ -0,0 +1,544 @@
+ import subprocess
+
+ import numpy as np
+ import tempfile
+ from pathlib import Path
+ from tqdm import tqdm
+ from rdkit import Chem, DataStructs
+ from rdkit.Chem import AllChem
+ from rdkit.Chem import Descriptors, Crippen, Lipinski, QED
+ from rdkit.Chem import MolSanitizeException
+ from src.analysis.SA_Score.sascorer import calculateScore
+ from src.utils import write_sdf_file
+
+ from copy import deepcopy
+
+
+ class CategoricalDistribution:
+     EPS = 1e-10
+
+     def __init__(self, histogram_dict, mapping):
+         histogram = np.zeros(len(mapping))
+         for k, v in histogram_dict.items():
+             histogram[mapping[k]] = v
+
+         # Normalize histogram
+         self.p = histogram / histogram.sum()
+         self.mapping = deepcopy(mapping)
+
+     def kl_divergence(self, other_sample):
+         sample_histogram = np.zeros(len(self.mapping))
+         for x in other_sample:
+             # sample_histogram[self.mapping[x]] += 1
+             sample_histogram[x] += 1
+
+         # Normalize
+         q = sample_histogram / sample_histogram.sum()
+
+         return -np.sum(self.p * np.log(q / (self.p + self.EPS) + self.EPS))
+
+
+ def check_mol(rdmol):
+     """
+     See also: https://www.rdkit.org/docs/RDKit_Book.html#molecular-sanitization
+     """
+     if rdmol is None:
+         return 'is_none'
+
+     _rdmol = Chem.Mol(rdmol)
+     try:
+         Chem.SanitizeMol(_rdmol)
+         return 'valid'
+     except ValueError as e:
+         assert isinstance(e, MolSanitizeException)
+         return type(e).__name__
+
+
+ def validity_analysis(rdmol_list):
+     """
+     For explanations, see: https://www.rdkit.org/docs/RDKit_Book.html#molecular-sanitization
+     """
+
+     result = {
+         'AtomValenceException': 0,  # atoms in higher-than-allowed valence states
+         'AtomKekulizeException': 0,
+         'KekulizeException': 0,  # ring cannot be kekulized, or aromatic bonds found outside of rings
+         'other': 0,
+         'valid': 0
+     }
+
+     for rdmol in rdmol_list:
+         flag = check_mol(rdmol)
+
+         try:
+             result[flag] += 1
+         except KeyError:
+             result['other'] += 1
+
+     assert sum(result.values()) == len(rdmol_list)
+
+     return result
+
+
+ class MoleculeValidity:
+     def __init__(self, connectivity_thresh=1.0):
+         self.connectivity_thresh = connectivity_thresh
+
+     def compute_validity(self, generated):
+         """ generated: list of RDKit molecules. """
+         if len(generated) < 1:
+             return [], 0.0
+
+         # Return copies of the valid molecules
+         valid = [Chem.Mol(mol) for mol in generated if check_mol(mol) == 'valid']
+         return valid, len(valid) / len(generated)
+
+     def compute_connectivity(self, valid):
+         """
+         Consider a molecule connected if its largest fragment contains at
+         least <self.connectivity_thresh * 100>% of all atoms.
+         :param valid: list of valid RDKit molecules
+         """
+         if len(valid) < 1:
+             return [], 0.0
+
+         for mol in valid:
+             Chem.SanitizeMol(mol)  # all molecules should be valid
+
+         connected = []
+         for mol in valid:
+
+             if mol.GetNumAtoms() < 1:
+                 continue
+
+             try:
+                 mol_frags = Chem.rdmolops.GetMolFrags(mol, asMols=True)
+             except MolSanitizeException as e:
+                 print('Error while computing connectivity:', e)
+                 continue
+
+             largest_frag = max(mol_frags, default=mol, key=lambda m: m.GetNumAtoms())
+             if largest_frag.GetNumAtoms() / mol.GetNumAtoms() >= self.connectivity_thresh:
+                 connected.append(largest_frag)
+
+         return connected, len(connected) / len(valid)
+
+     def __call__(self, rdmols, verbose=False):
+         """
+         :param rdmols: list of RDKit molecules
+         """
+
+         results = {}
+         results['n_total'] = len(rdmols)
+
+         valid, validity = self.compute_validity(rdmols)
+         results['n_valid'] = len(valid)
+         results['validity'] = validity
+
+         connected, connectivity = self.compute_connectivity(valid)
+         results['n_connected'] = len(connected)
+         results['connectivity'] = connectivity
+         results['valid_and_connected'] = results['n_connected'] / results['n_total']
+
+         if verbose:
+             print(f"Validity over {results['n_total']} molecules: {validity * 100 :.2f}%")
+             print(f"Connectivity over {results['n_valid']} valid molecules: {connectivity * 100 :.2f}%")
+
+         return results
+
+
+ class MolecularMetrics:
+     def __init__(self, connectivity_thresh=1.0):
+         self.connectivity_thresh = connectivity_thresh
+
+     @staticmethod
+     def is_valid(rdmol):
+         if rdmol.GetNumAtoms() < 1:
+             return False
+
+         _mol = Chem.Mol(rdmol)
+         try:
+             Chem.SanitizeMol(_mol)
+         except ValueError:
+             return False
+
+         return True
+
+     def is_connected(self, rdmol):
+
+         if rdmol.GetNumAtoms() < 1:
+             return False
+
+         mol_frags = Chem.rdmolops.GetMolFrags(rdmol, asMols=True)
+
+         largest_frag = max(mol_frags, default=rdmol, key=lambda m: m.GetNumAtoms())
+         return largest_frag.GetNumAtoms() / rdmol.GetNumAtoms() >= self.connectivity_thresh
+
+     @staticmethod
+     def calculate_qed(rdmol):
+         return QED.qed(rdmol)
+
+     @staticmethod
+     def calculate_sa(rdmol):
+         sa = calculateScore(rdmol)
+         return sa
+
+     @staticmethod
+     def calculate_logp(rdmol):
+         return Crippen.MolLogP(rdmol)
+
+     @staticmethod
+     def calculate_lipinski(rdmol):
+         rule_1 = Descriptors.ExactMolWt(rdmol) < 500
+         rule_2 = Lipinski.NumHDonors(rdmol) <= 5
+         rule_3 = Lipinski.NumHAcceptors(rdmol) <= 10
+         rule_4 = -2 <= Crippen.MolLogP(rdmol) <= 5
+         rule_5 = Chem.rdMolDescriptors.CalcNumRotatableBonds(rdmol) <= 10
+         return np.sum([int(a) for a in [rule_1, rule_2, rule_3, rule_4, rule_5]])
+
+     def __call__(self, rdmol):
+         valid = self.is_valid(rdmol)
+
+         if valid:
+             Chem.SanitizeMol(rdmol)
+
+         connected = None if not valid else self.is_connected(rdmol)
+         qed = None if not valid else self.calculate_qed(rdmol)
+         sa = None if not valid else self.calculate_sa(rdmol)
+         logp = None if not valid else self.calculate_logp(rdmol)
+         lipinski = None if not valid else self.calculate_lipinski(rdmol)
+
+         return {
+             'valid': valid,
+             'connected': connected,
+             'qed': qed,
+             'sa': sa,
+             'logp': logp,
+             'lipinski': lipinski
+         }
+
+
+ class Diversity:
+     @staticmethod
+     def similarity(fp1, fp2):
+         return DataStructs.TanimotoSimilarity(fp1, fp2)
+
+     def get_fingerprint(self, mol):
+         # fp = AllChem.GetMorganFingerprintAsBitVect(
+         #     mol, 2, nBits=2048, useChirality=False)
+         fp = Chem.RDKFingerprint(mol)
+         return fp
+
+     def __call__(self, pocket_mols):
+
+         if len(pocket_mols) < 2:
+             return 0.0
+
+         pocket_fps = [self.get_fingerprint(m) for m in pocket_mols]
+
+         div = 0
+         total = 0
+         for i in range(len(pocket_fps)):
+             for j in range(i + 1, len(pocket_fps)):
+                 div += 1 - self.similarity(pocket_fps[i], pocket_fps[j])
+                 total += 1
+
+         return div / total
+
+
+ class MoleculeUniqueness:
+     def __call__(self, smiles_list):
+         """ smiles_list: list of SMILES strings. """
+         if len(smiles_list) < 1:
+             return 0.0
+
+         return len(set(smiles_list)) / len(smiles_list)
+
+
+ class MoleculeNovelty:
+     def __init__(self, reference_smiles):
+         """
+         :param reference_smiles: list of SMILES strings
+         """
+         self.reference_smiles = set(reference_smiles)
+
+     def __call__(self, smiles_list):
+         if len(smiles_list) < 1:
+             return 0.0
+
+         novel = [smi for smi in smiles_list if smi not in self.reference_smiles]
+         return len(novel) / len(smiles_list)
+
+
+ class MolecularProperties:
+
+     @staticmethod
+     def calculate_qed(rdmol):
+         return QED.qed(rdmol)
+
+     @staticmethod
+     def calculate_sa(rdmol):
+         sa = calculateScore(rdmol)
+         # return round((10 - sa) / 9, 2)  # from pocket2mol
+         return sa
+
+     @staticmethod
+     def calculate_logp(rdmol):
+         return Crippen.MolLogP(rdmol)
+
+     @staticmethod
+     def calculate_lipinski(rdmol):
+         rule_1 = Descriptors.ExactMolWt(rdmol) < 500
+         rule_2 = Lipinski.NumHDonors(rdmol) <= 5
+         rule_3 = Lipinski.NumHAcceptors(rdmol) <= 10
+         rule_4 = -2 <= Crippen.MolLogP(rdmol) <= 5
+         rule_5 = Chem.rdMolDescriptors.CalcNumRotatableBonds(rdmol) <= 10
+         return np.sum([int(a) for a in [rule_1, rule_2, rule_3, rule_4, rule_5]])
+
+     @classmethod
+     def calculate_diversity(cls, pocket_mols):
+         if len(pocket_mols) < 2:
+             return 0.0
+
+         div = 0
+         total = 0
+         for i in range(len(pocket_mols)):
+             for j in range(i + 1, len(pocket_mols)):
+                 div += 1 - cls.similarity(pocket_mols[i], pocket_mols[j])
+                 total += 1
+         return div / total
+
+     @staticmethod
+     def similarity(mol_a, mol_b):
+         # fp1 = AllChem.GetMorganFingerprintAsBitVect(
+         #     mol_a, 2, nBits=2048, useChirality=False)
+         # fp2 = AllChem.GetMorganFingerprintAsBitVect(
+         #     mol_b, 2, nBits=2048, useChirality=False)
+         fp1 = Chem.RDKFingerprint(mol_a)
+         fp2 = Chem.RDKFingerprint(mol_b)
+         return DataStructs.TanimotoSimilarity(fp1, fp2)
+
+     def evaluate_pockets(self, pocket_rdmols, verbose=False):
+         """
+         Run the full evaluation.
+         Args:
+             pocket_rdmols: list of lists; each inner list contains all RDKit
+                 molecules generated for one pocket
+         Returns:
+             QED, SA, LogP, Lipinski (per molecule), and Diversity (per pocket)
+         """
+
+         for pocket in pocket_rdmols:
+             for mol in pocket:
+                 Chem.SanitizeMol(mol)  # only evaluate valid molecules
+
+         all_qed = []
+         all_sa = []
+         all_logp = []
+         all_lipinski = []
+         per_pocket_diversity = []
+         for pocket in tqdm(pocket_rdmols):
+             all_qed.append([self.calculate_qed(mol) for mol in pocket])
+             all_sa.append([self.calculate_sa(mol) for mol in pocket])
+             all_logp.append([self.calculate_logp(mol) for mol in pocket])
+             all_lipinski.append([self.calculate_lipinski(mol) for mol in pocket])
+             per_pocket_diversity.append(self.calculate_diversity(pocket))
+
+         qed_flattened = [x for px in all_qed for x in px]
+         sa_flattened = [x for px in all_sa for x in px]
+         logp_flattened = [x for px in all_logp for x in px]
+         lipinski_flattened = [x for px in all_lipinski for x in px]
+
+         if verbose:
+             print(f"{sum([len(p) for p in pocket_rdmols])} molecules from "
+                   f"{len(pocket_rdmols)} pockets evaluated.")
+             print(f"QED: {np.mean(qed_flattened):.3f} ± {np.std(qed_flattened):.2f}")
+             print(f"SA: {np.mean(sa_flattened):.3f} ± {np.std(sa_flattened):.2f}")
+             print(f"LogP: {np.mean(logp_flattened):.3f} ± {np.std(logp_flattened):.2f}")
+             print(f"Lipinski: {np.mean(lipinski_flattened):.3f} ± {np.std(lipinski_flattened):.2f}")
+             print(f"Diversity: {np.mean(per_pocket_diversity):.3f} ± {np.std(per_pocket_diversity):.2f}")
+
+         return all_qed, all_sa, all_logp, all_lipinski, per_pocket_diversity
+
+     def __call__(self, rdmols):
+         """
+         Run the full evaluation and return the mean of each property.
+         Args:
+             rdmols: list of RDKit molecules
+         Returns:
+             Dictionary with mean QED, SA, LogP, Lipinski, and Diversity values
+         """
+
+         if len(rdmols) < 1:
+             return {'QED': 0.0, 'SA': 0.0, 'LogP': 0.0, 'Lipinski': 0.0,
+                     'Diversity': 0.0}
+
+         _rdmols = []
+         for mol in rdmols:
+             try:
+                 Chem.SanitizeMol(mol)  # only evaluate valid molecules
+                 _rdmols.append(mol)
+             except ValueError:
+                 print("Tried to analyze invalid molecule")
+         rdmols = _rdmols
+
+         qed = np.mean([self.calculate_qed(mol) for mol in rdmols])
+         sa = np.mean([self.calculate_sa(mol) for mol in rdmols])
+         logp = np.mean([self.calculate_logp(mol) for mol in rdmols])
+         lipinski = np.mean([self.calculate_lipinski(mol) for mol in rdmols])
+         diversity = self.calculate_diversity(rdmols)
+
+         return {'QED': qed, 'SA': sa, 'LogP': logp, 'Lipinski': lipinski,
+                 'Diversity': diversity}
+
+
+ def compute_gnina_scores(ligands, receptors, gnina):
+     metrics = ['minimizedAffinity', 'minimizedRMSD', 'CNNscore', 'CNNaffinity', 'CNN_VS', 'CNNaffinity_variance']
+     out = {m: [] for m in metrics}
+     with tempfile.TemporaryDirectory() as tmpdir:
+         for ligand, receptor in zip(tqdm(ligands, desc='Docking'), receptors):
+             in_ligand_path = Path(tmpdir, 'in_ligand.sdf')
+             out_ligand_path = Path(tmpdir, 'out_ligand.sdf')
+             receptor_path = Path(tmpdir, 'receptor.pdb')
+             write_sdf_file(in_ligand_path, [ligand], catch_errors=True)
+             Chem.MolToPDBFile(receptor, str(receptor_path))
+             if (
+                 (not in_ligand_path.exists()) or
+                 (not receptor_path.exists()) or
+                 in_ligand_path.read_text() == '' or
+                 receptor_path.read_text() == ''
+             ):
+                 continue
+
+             cmd = (
+                 f'{gnina} -r {receptor_path} -l {in_ligand_path} '
+                 f'--minimize --seed 42 -o {out_ligand_path} --no_gpu 1> /dev/null'
+             )
+             subprocess.run(cmd, shell=True)
+             if not out_ligand_path.exists() or out_ligand_path.read_text() == '':
+                 continue
+
+             mol = Chem.SDMolSupplier(str(out_ligand_path), sanitize=False)[0]
+             for metric in metrics:
+                 out[metric].append(float(mol.GetProp(metric)))
+
+     for metric in metrics:
+         out[metric] = sum(out[metric]) / len(out[metric]) if len(out[metric]) > 0 else 0
+
+     return out
+
+
+ def legacy_clash_score(rdmol1, rdmol2=None, margin=0.75):
+     """
+     Computes a clash score as the number of atoms that have at least one
+     clash divided by the number of atoms in the molecule.
+
+     INTERMOLECULAR CLASH SCORE
+     If rdmol2 is provided, the score is the percentage of atoms in rdmol1
+     that have at least one clash with rdmol2.
+     We define a clash if two atoms are closer than "margin times the sum of
+     their van der Waals radii".
+
+     INTRAMOLECULAR CLASH SCORE
+     If rdmol2 is not provided, the score is the percentage of atoms in rdmol1
+     that have at least one clash with other atoms in rdmol1.
+     In this case, a clash is defined by margin times the atoms' smallest
+     covalent radii (among single, double and triple bond radii). This is done
+     so that this function is applicable even if no connectivity information is
+     available.
+     """
+     # source: https://en.wikipedia.org/wiki/Van_der_Waals_radius
+     vdw_radii = {'N': 1.55, 'O': 1.52, 'C': 1.70, 'H': 1.10, 'S': 1.80, 'P': 1.80,
+                  'Se': 1.90, 'K': 2.75, 'Na': 2.27, 'Mg': 1.73, 'Zn': 1.39, 'B': 1.92,
+                  'Br': 1.85, 'Cl': 1.75, 'I': 1.98, 'F': 1.47}
+
+     # https://en.wikipedia.org/wiki/Covalent_radius#Radii_for_multiple_bonds
+     covalent_radii = {'H': 0.32, 'C': 0.60, 'N': 0.54, 'O': 0.53, 'F': 0.53, 'B': 0.73,
+                       'Al': 1.11, 'Si': 1.02, 'P': 0.94, 'S': 0.94, 'Cl': 0.93, 'As': 1.06,
+                       'Br': 1.09, 'I': 1.25, 'Hg': 1.33, 'Bi': 1.35}
+
+     coord1 = rdmol1.GetConformer().GetPositions()
+
+     if rdmol2 is None:
+         radii1 = np.array([covalent_radii[a.GetSymbol()] for a in rdmol1.GetAtoms()])
+         assert coord1.shape[0] == radii1.shape[0]
+
+         dist = np.sqrt(np.sum((coord1[:, None, :] - coord1[None, :, :]) ** 2, axis=-1))
+         np.fill_diagonal(dist, np.inf)
+         clashes = dist < margin * (radii1[:, None] + radii1[None, :])
+
+     else:
+         coord2 = rdmol2.GetConformer().GetPositions()
+
+         radii1 = np.array([vdw_radii[a.GetSymbol()] for a in rdmol1.GetAtoms()])
+         assert coord1.shape[0] == radii1.shape[0]
+         radii2 = np.array([vdw_radii[a.GetSymbol()] for a in rdmol2.GetAtoms()])
+         assert coord2.shape[0] == radii2.shape[0]
+
+         dist = np.sqrt(np.sum((coord1[:, None, :] - coord2[None, :, :]) ** 2, axis=-1))
+         clashes = dist < margin * (radii1[:, None] + radii2[None, :])
+
+     clashes = np.any(clashes, axis=1)
+     return np.mean(clashes)
+
+
+ def clash_score(rdmol1, rdmol2=None, margin=0.75, ignore={'H'}):
+     """
+     Computes a clash score as the number of atoms that have at least one
+     clash divided by the number of atoms in the molecule.
+
+     INTERMOLECULAR CLASH SCORE
+     If rdmol2 is provided, the score is the percentage of atoms in rdmol1
+     that have at least one clash with rdmol2.
+     We define a clash if two atoms are closer than "margin times the sum of
+     their van der Waals radii".
+
+     INTRAMOLECULAR CLASH SCORE
+     If rdmol2 is not provided, the score is the percentage of atoms in rdmol1
+     that have at least one clash with other atoms in rdmol1.
+     In this case, a clash is defined by margin times the atoms' smallest
+     covalent radii (among single, double and triple bond radii). This is done
+     so that this function is applicable even if no connectivity information is
+     available.
+     """
+
+     intramolecular = rdmol2 is None
+
+     _periodic_table = AllChem.GetPeriodicTable()
+
+     def _coord_and_radii(rdmol):
+         coord = rdmol.GetConformer().GetPositions()
+         radii = np.array([_get_radius(a.GetSymbol()) for a in rdmol.GetAtoms()])
518
+
519
+ mask = np.array([a.GetSymbol() not in ignore for a in rdmol.GetAtoms()])
520
+ coord = coord[mask]
521
+ radii = radii[mask]
522
+
523
+ assert coord.shape[0] == radii.shape[0]
524
+ return coord, radii
525
+
526
+ # INTRAMOLECULAR CLASH SCORE
527
+ if intramolecular:
528
+ rdmol2 = rdmol1
529
+ _get_radius = _periodic_table.GetRcovalent # covalent radii
530
+
531
+ # INTERMOLECULAR CLASH SCORE
532
+ else:
533
+ _get_radius = _periodic_table.GetRvdw # vdW radii
534
+
535
+ coord1, radii1 = _coord_and_radii(rdmol1)
536
+ coord2, radii2 = _coord_and_radii(rdmol2)
537
+
538
+ dist = np.sqrt(np.sum((coord1[:, None, :] - coord2[None, :, :]) ** 2, axis=-1))
539
+ if intramolecular:
540
+ np.fill_diagonal(dist, np.inf)
541
+
542
+ clashes = dist < margin * (radii1[:, None] + radii2[None, :])
543
+ clashes = np.any(clashes, axis=1)
544
+ return np.mean(clashes)
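Usage sketch for both clash-score variants defined above (file paths are placeholders):

```python
from rdkit import Chem

lig = Chem.SDMolSupplier('ligand.sdf', sanitize=False)[0]
rec = Chem.MolFromPDBFile('receptor.pdb', sanitize=False)

intra = clash_score(lig)        # intramolecular: covalent radii, self-distances masked out
inter = clash_score(lig, rec)   # intermolecular: vdW radii, hydrogens ignored by default
print(f"intra: {intra:.2%}, inter: {inter:.2%}")
```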
src/analysis/visualization_utils.py ADDED
@@ -0,0 +1,192 @@
1
+ import warnings
2
+
3
+ import torch
4
+ from rdkit import Chem
5
+ from rdkit.Chem import Draw, AllChem
6
+ from rdkit.Chem import SanitizeFlags
7
+ from src.analysis.metrics import check_mol
8
+ from src import utils
9
+ from src.data.molecule_builder import build_molecule
10
+ from src.data.misc import protein_letters_1to3
11
+
12
+
13
+ # def pocket_to_rdkit(pocket, pocket_representation, atom_encoder=None,
14
+ # atom_decoder=None, aa_decoder=None, residue_decoder=None,
15
+ # aa_atom_index=None):
16
+ #
17
+ # rdpockets = []
18
+ # for i in torch.unique(pocket['mask']):
19
+ #
20
+ # node_coord = pocket['x'][pocket['mask'] == i]
21
+ # h = pocket['one_hot'][pocket['mask'] == i]
22
+ #
23
+ # if pocket_representation == 'side_chain_bead':
24
+ # coord = node_coord
25
+ #
26
+ # node_types = [residue_decoder[b] for b in h[:, -len(residue_decoder):].argmax(-1)]
27
+ # atom_types = ['C' if r == 'CA' else 'F' for r in node_types]
28
+ #
29
+ # elif pocket_representation == 'CA+':
30
+ # aa_types = [aa_decoder[b] for b in h.argmax(-1)]
31
+ # side_chain_vec = pocket['v'][pocket['mask'] == i]
32
+ #
33
+ # coord = []
34
+ # atom_types = []
35
+ # for xyz, aa, vec in zip(node_coord, aa_types, side_chain_vec):
36
+ # # C_alpha
37
+ # coord.append(xyz)
38
+ # atom_types.append('C')
39
+ #
40
+ # # all other atoms
41
+ # for atom_name, idx in aa_atom_index[aa].items():
42
+ # coord.append(xyz + vec[idx])
43
+ # atom_types.append(atom_name[0])
44
+ #
45
+ # coord = torch.stack(coord, dim=0)
46
+ #
47
+ # else:
48
+ # raise NotImplementedError(f"{pocket_representation} residue representation not supported")
49
+ #
50
+ # atom_types = torch.tensor([atom_encoder[a] for a in atom_types])
51
+ # rdpockets.append(build_molecule(coord, atom_types, atom_decoder=atom_decoder))
52
+ #
53
+ # return rdpockets
54
+ def pocket_to_rdkit(pocket, pocket_representation, atom_encoder=None,
55
+ atom_decoder=None, aa_decoder=None, residue_decoder=None,
56
+ aa_atom_index=None):
57
+
58
+ rdpockets = []
59
+ for i in torch.unique(pocket['mask']):
60
+
61
+ node_coord = pocket['x'][pocket['mask'] == i]
62
+ h = pocket['one_hot'][pocket['mask'] == i]
63
+ atom_mask = pocket['atom_mask'][pocket['mask'] == i]
64
+
65
+ pdb_infos = []
66
+
67
+ if pocket_representation == 'side_chain_bead':
68
+ coord = node_coord
69
+
70
+ node_types = [residue_decoder[b] for b in h[:, -len(residue_decoder):].argmax(-1)]
71
+ atom_types = ['C' if r == 'CA' else 'F' for r in node_types]
72
+
73
+ elif pocket_representation == 'CA+':
74
+ aa_types = [aa_decoder[b] for b in h.argmax(-1)]
75
+ side_chain_vec = pocket['v'][pocket['mask'] == i]
76
+
77
+ coord = []
78
+ atom_types = []
79
+ for resi, (xyz, aa, vec, am) in enumerate(zip(node_coord, aa_types, side_chain_vec, atom_mask)):
80
+
81
+ # with the updated atom dictionary, CA is indexed like any other atom
82
+ for atom_name, idx in aa_atom_index[aa].items():
83
+
84
+ if not am[idx]:
85
+ warnings.warn(f"Missing atom {atom_name} in {aa}:{resi}")
86
+ continue
87
+
88
+ coord.append(xyz + vec[idx])
89
+ atom_types.append(atom_name[0])
90
+
91
+ info = Chem.AtomPDBResidueInfo()
92
+ # info.SetChainId('A')
93
+ info.SetResidueName(protein_letters_1to3[aa])
94
+ info.SetResidueNumber(resi + 1)
95
+ info.SetOccupancy(1.0)
96
+ info.SetTempFactor(0.0)
97
+ info.SetName(f' {atom_name:<3}')
98
+ pdb_infos.append(info)
99
+
100
+ coord = torch.stack(coord, dim=0)
101
+
102
+ else:
103
+ raise NotImplementedError(f"{pocket_representation} residue representation not supported")
104
+
105
+ atom_types = torch.tensor([atom_encoder[a] for a in atom_types])
106
+ rdmol = build_molecule(coord, atom_types, atom_decoder=atom_decoder)
107
+
108
+ if len(pdb_infos) == len(rdmol.GetAtoms()):
109
+ for a, info in zip(rdmol.GetAtoms(), pdb_infos):
110
+ a.SetPDBResidueInfo(info)
111
+
112
+ rdpockets.append(rdmol)
113
+
114
+ return rdpockets
115
+
116
+
117
+ def mols_to_pdbfile(rdmols, filename, flavor=0):
118
+ pdb_str = ""
119
+ for i, mol in enumerate(rdmols):
120
+ pdb_str += f"MODEL{i + 1:>9}\n"
121
+ block = Chem.MolToPDBBlock(mol, flavor=flavor)
122
+ block = "\n".join(block.split("\n")[:-2]) # remove END
123
+ pdb_str += block + "\n"
124
+ pdb_str += f"ENDMDL\n"
125
+ pdb_str += f"END\n"
126
+
127
+ with open(filename, 'w') as f:
128
+ f.write(pdb_str)
129
+
130
+ return pdb_str
131
+
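For example, pockets converted with `pocket_to_rdkit` above can be written as a single multi-model PDB file; the `pocket` dict is assumed to come from the data pipeline:

```python
from src.constants import atom_encoder, atom_decoder, aa_decoder, aa_atom_index

rdpockets = pocket_to_rdkit(pocket, 'CA+',  # `pocket` as produced by the data pipeline (assumed)
                            atom_encoder=atom_encoder, atom_decoder=atom_decoder,
                            aa_decoder=aa_decoder, aa_atom_index=aa_atom_index)
mols_to_pdbfile(rdpockets, 'pockets.pdb')   # one MODEL/ENDMDL block per pocket
```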
132
+
133
+ def mol_as_pdb(rdmol, filename=None, bfactor=None):
134
+
135
+ _rdmol = Chem.Mol(rdmol) # copy
136
+ for a in _rdmol.GetAtoms():
137
+ a.SetIsAromatic(False)
138
+ for b in _rdmol.GetBonds():
139
+ b.SetIsAromatic(False)
140
+
141
+ if bfactor is not None:
142
+ for a in _rdmol.GetAtoms():
143
+ val = a.GetPropsAsDict()[bfactor]
144
+
145
+ info = Chem.AtomPDBResidueInfo()
146
+ info.SetResidueName('UNL')
147
+ info.SetResidueNumber(1)
148
+ info.SetName(f' {a.GetSymbol():<3}')
149
+ info.SetIsHeteroAtom(True)
150
+ info.SetOccupancy(1.0)
151
+ info.SetTempFactor(val)
152
+ a.SetPDBResidueInfo(info)
153
+
154
+ pdb_str = Chem.MolToPDBBlock(_rdmol)
155
+
156
+ if filename is not None:
157
+ with open(filename, 'w') as f:
158
+ f.write(pdb_str)
159
+
160
+ return pdb_str
161
+
162
+
163
+ def draw_grid(molecules, mols_per_row=5, fig_size=(200, 200),
164
+ label=check_mol,
165
+ highlight_atom=lambda atom: False,
166
+ highlight_bond=lambda bond: False):
167
+
168
+ draw_mols = []
169
+ marked_atoms = []
170
+ marked_bonds = []
171
+ for mol in molecules:
172
+ draw_mol = Chem.Mol(mol) # copy
173
+ Chem.SanitizeMol(draw_mol, sanitizeOps=SanitizeFlags.SANITIZE_NONE)
174
+ AllChem.Compute2DCoords(draw_mol)
175
+ draw_mol = Draw.rdMolDraw2D.PrepareMolForDrawing(draw_mol,
176
+ kekulize=False)
177
+ draw_mols.append(draw_mol)
178
+ marked_atoms.append([a.GetIdx() for a in draw_mol.GetAtoms() if highlight_atom(a)])
179
+ marked_bonds.append([b.GetIdx() for b in draw_mol.GetBonds() if highlight_bond(b)])
180
+
181
+ drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
182
+ drawOptions.prepareMolsBeforeDrawing = False
183
+ drawOptions.highlightBondWidthMultiplier = 20
184
+
185
+ return Draw.MolsToGridImage(draw_mols,
186
+ molsPerRow=mols_per_row,
187
+ subImgSize=fig_size,
188
+ drawOptions=drawOptions,
189
+ highlightAtomLists=marked_atoms,
190
+ highlightBondLists=marked_bonds,
191
+ legends=[f'[{i}] {label(mol)}' for
192
+ i, mol in enumerate(draw_mols)])
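Example call highlighting aromatic atoms and bonds (`molecules` is assumed to be a list of RDKit molecules):

```python
img = draw_grid(molecules,
                mols_per_row=4,
                highlight_atom=lambda a: a.GetIsAromatic(),
                highlight_bond=lambda b: b.GetIsAromatic())
img.save('grid.png')  # outside notebooks, Draw.MolsToGridImage returns a PIL image
```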
src/constants.py ADDED
@@ -0,0 +1,256 @@
1
+ import os
2
+ from rdkit import Chem
3
+ import torch
4
+ import numpy as np
5
+
6
+ # ------------------------------------------------------------------------------
7
+ # Computational
8
+ # ------------------------------------------------------------------------------
9
+ FLOAT_TYPE = torch.float32
10
+ INT_TYPE = torch.int64
11
+
12
+
13
+ # ------------------------------------------------------------------------------
14
+ # Type encoding/decoding
15
+ # ------------------------------------------------------------------------------
16
+
17
+ atom_dict = os.environ.get('ATOM_DICT')
18
+ if atom_dict == 'simple':
19
+ atom_encoder = {'C': 0, 'N': 1, 'O': 2, 'S': 3, 'B': 4, 'Br': 5, 'Cl': 6, 'P': 7, 'I': 8, 'F': 9, 'NOATOM': 10}
20
+ atom_decoder = ['C', 'N', 'O', 'S', 'B', 'Br', 'Cl', 'P', 'I', 'F', 'NOATOM']
21
+
22
+ else:
23
+ atom_encoder = {'C': 0, 'N': 1, 'O': 2, 'S': 3, 'B': 4, 'Br': 5, 'Cl': 6, 'P': 7, 'I': 8, 'F': 9, 'NH': 10, 'N+': 11, 'O-': 12, 'NOATOM': 13}
24
+ atom_decoder = ['C', 'N', 'O', 'S', 'B', 'Br', 'Cl', 'P', 'I', 'F', 'NH', 'N+', 'O-', 'NOATOM']
25
+
26
+ bond_encoder = {"NOBOND": 0, "SINGLE": 1, "DOUBLE": 2, "TRIPLE": 3, 'AROMATIC': 4}
27
+ bond_decoder = [None, Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
28
+
29
+ aa_encoder = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19}
30
+ aa_decoder = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
31
+
32
+ residue_encoder = {'CA': 0, 'SS': 1}
33
+ residue_decoder = ['CA', 'SS']
34
+
35
+ residue_bond_encoder = {'CA-CA': 0, 'CA-SS': 1, 'NOBOND': 2}
36
+ residue_bond_decoder = ['CA-CA', 'CA-SS', None]
37
+
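The encoder dicts and decoder lists are inverses of each other; a quick round trip through the bond vocabulary:

```python
import torch
import torch.nn.functional as F
from rdkit import Chem

one_hot = F.one_hot(torch.tensor(bond_encoder['DOUBLE']), num_classes=len(bond_encoder))
assert bond_decoder[one_hot.argmax().item()] == Chem.rdchem.BondType.DOUBLE
assert bond_decoder[bond_encoder['NOBOND']] is None  # "no bond" decodes to None
```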
38
+ # aa_atom_index = {
39
+ # 'A': {'N': 0, 'C': 1, 'O': 2, 'CB': 3},
40
+ # 'C': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'SG': 4},
41
+ # 'D': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'OD1': 5, 'OD2': 6},
42
+ # 'E': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5, 'OE1': 6, 'OE2': 7},
43
+ # 'F': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD1': 5, 'CD2': 6, 'CE1': 7, 'CE2': 8, 'CZ': 9},
44
+ # 'G': {'N': 0, 'C': 1, 'O': 2},
45
+ # 'H': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'ND1': 5, 'CD2': 6, 'CE1': 7, 'NE2': 8},
46
+ # 'I': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG1': 4, 'CG2': 5, 'CD1': 6},
47
+ # 'K': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5, 'CE': 6, 'NZ': 7},
48
+ # 'L': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD1': 5, 'CD2': 6},
49
+ # 'M': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'SD': 5, 'CE': 6},
50
+ # 'N': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'OD1': 5, 'ND2': 6},
51
+ # 'P': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5},
52
+ # 'Q': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5, 'OE1': 6, 'NE2': 7},
53
+ # 'R': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD': 5, 'NE': 6, 'CZ': 7, 'NH1': 8, 'NH2': 9},
54
+ # 'S': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'OG': 4},
55
+ # 'T': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'OG1': 4, 'CG2': 5},
56
+ # 'V': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG1': 4, 'CG2': 5},
57
+ # 'W': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD1': 5, 'CD2': 6, 'NE1': 7, 'CE2': 8, 'CE3': 9, 'CZ2': 10, 'CZ3': 11, 'CH2': 12},
58
+ # 'Y': {'N': 0, 'C': 1, 'O': 2, 'CB': 3, 'CG': 4, 'CD1': 5, 'CD2': 6, 'CE1': 7, 'CE2': 8, 'CZ': 9, 'OH': 10},
59
+ # }
60
+ aa_atom_index = {
61
+ 'A': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4},
62
+ 'C': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'SG': 5},
63
+ 'D': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'OD1': 6, 'OD2': 7},
64
+ 'E': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6, 'OE1': 7, 'OE2': 8},
65
+ 'F': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD1': 6, 'CD2': 7, 'CE1': 8, 'CE2': 9, 'CZ': 10},
66
+ 'G': {'N': 0, 'CA': 1, 'C': 2, 'O': 3},
67
+ 'H': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'ND1': 6, 'CD2': 7, 'CE1': 8, 'NE2': 9},
68
+ 'I': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG1': 5, 'CG2': 6, 'CD1': 7},
69
+ 'K': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6, 'CE': 7, 'NZ': 8},
70
+ 'L': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD1': 6, 'CD2': 7},
71
+ 'M': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'SD': 6, 'CE': 7},
72
+ 'N': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'OD1': 6, 'ND2': 7},
73
+ 'P': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6},
74
+ 'Q': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6, 'OE1': 7, 'NE2': 8},
75
+ 'R': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD': 6, 'NE': 7, 'CZ': 8, 'NH1': 9, 'NH2': 10},
76
+ 'S': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'OG': 5},
77
+ 'T': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'OG1': 5, 'CG2': 6},
78
+ 'V': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG1': 5, 'CG2': 6},
79
+ 'W': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD1': 6, 'CD2': 7, 'NE1': 8, 'CE2': 9, 'CE3': 10, 'CZ2': 11, 'CZ3': 12, 'CH2': 13},
80
+ 'Y': {'N': 0, 'CA': 1, 'C': 2, 'O': 3, 'CB': 4, 'CG': 5, 'CD1': 6, 'CD2': 7, 'CE1': 8, 'CE2': 9, 'CZ': 10, 'OH': 11},
81
+ }
82
+
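These slot indices define a fixed-size layout for the per-residue atom features; the feature dimension is the largest slot index plus one:

```python
v_dim = max(i for aa in aa_atom_index.values() for i in aa.values()) + 1
assert v_dim == 14                    # tryptophan's CH2 occupies the last slot (13)
assert aa_atom_index['A']['CB'] == 4  # e.g. alanine's CB lives in slot 4
```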
83
+ # ------------------------------------------------------------------------------
84
+ # NERF
85
+ # ------------------------------------------------------------------------------
86
+
87
+ # indicates whether atom exists
88
+ aa_atom_mask = {
89
+ 'A': [True, True, True, True, True, False, False, False, False, False, False, False, False, False],
90
+ 'C': [True, True, True, True, True, True, False, False, False, False, False, False, False, False],
91
+ 'D': [True, True, True, True, True, True, True, True, False, False, False, False, False, False],
92
+ 'E': [True, True, True, True, True, True, True, True, True, False, False, False, False, False],
93
+ 'F': [True, True, True, True, True, True, True, True, True, True, True, False, False, False],
94
+ 'G': [True, True, True, True, False, False, False, False, False, False, False, False, False, False],
95
+ 'H': [True, True, True, True, True, True, True, True, True, True, False, False, False, False],
96
+ 'I': [True, True, True, True, True, True, True, True, False, False, False, False, False, False],
97
+ 'K': [True, True, True, True, True, True, True, True, True, False, False, False, False, False],
98
+ 'L': [True, True, True, True, True, True, True, True, False, False, False, False, False, False],
99
+ 'M': [True, True, True, True, True, True, True, True, False, False, False, False, False, False],
100
+ 'N': [True, True, True, True, True, True, True, True, False, False, False, False, False, False],
101
+ 'P': [True, True, True, True, True, True, True, False, False, False, False, False, False, False],
102
+ 'Q': [True, True, True, True, True, True, True, True, True, False, False, False, False, False],
103
+ 'R': [True, True, True, True, True, True, True, True, True, True, True, False, False, False],
104
+ 'S': [True, True, True, True, True, True, False, False, False, False, False, False, False, False],
105
+ 'T': [True, True, True, True, True, True, True, False, False, False, False, False, False, False],
106
+ 'V': [True, True, True, True, True, True, True, False, False, False, False, False, False, False],
107
+ 'W': [True, True, True, True, True, True, True, True, True, True, True, True, True, True],
108
+ 'Y': [True, True, True, True, True, True, True, True, True, True, True, True, False, False],
109
+ }
110
+
111
+ # (14, 3) index tensor with atom indices of atoms a, b and c for NERF reconstruction
112
+ # in principle, columns 1 and 2 can be inferred from column 0 (the immediate predecessor) alone
113
+ aa_nerf_indices = {
114
+ 'A': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
115
+ 'C': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
116
+ 'D': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
117
+ 'E': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [6, 5, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
118
+ 'F': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [6, 5, 4], [7, 5, 4], [8, 6, 5], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
119
+ 'G': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
120
+ 'H': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [6, 5, 4], [7, 5, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
121
+ 'I': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [4, 1, 0], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
122
+ 'K': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [7, 6, 5], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
123
+ 'L': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
124
+ 'M': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
125
+ 'N': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
126
+ 'P': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
127
+ 'Q': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [6, 5, 4], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
128
+ 'R': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [6, 5, 4], [7, 6, 5], [8, 7, 6], [8, 7, 6], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
129
+ 'S': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
130
+ 'T': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [4, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
131
+ 'V': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [4, 1, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]],
132
+ 'W': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [6, 5, 4], [7, 5, 4], [7, 5, 4], [9, 7, 5], [10, 7, 5], [11, 9, 7]],
133
+ 'Y': [[0, 0, 0], [0, 0, 0], [1, 0, 0], [2, 1, 0], [1, 0, 0], [4, 1, 0], [5, 4, 1], [5, 4, 1], [6, 5, 4], [7, 5, 4], [8, 6, 5], [10, 8, 6], [0, 0, 0], [0, 0, 0]],
134
+ }
135
+
136
+ # unique id for each rotatable bond (0=chi1, 1=chi2, ...)
137
+ aa_bond_to_chi = {
138
+ 'A': {},
139
+ 'C': {('CA', 'CB'): 0},
140
+ 'D': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
141
+ 'E': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'CD'): 2},
142
+ 'F': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
143
+ 'G': {},
144
+ 'H': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
145
+ 'I': {('CA', 'CB'): 0, ('CB', 'CG2'): 1},
146
+ 'K': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'CD'): 2, ('CD', 'CE'): 3},
147
+ 'L': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
148
+ 'M': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'SD'): 2},
149
+ 'N': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
150
+ 'P': {},
151
+ 'Q': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'CD'): 2},
152
+ 'R': {('CA', 'CB'): 0, ('CB', 'CG'): 1, ('CG', 'CD'): 2, ('CD', 'NE'): 3, ('NE', 'CZ'): 4},
153
+ 'S': {('CA', 'CB'): 0},
154
+ 'T': {('CA', 'CB'): 0},
155
+ 'V': {('CA', 'CB'): 0},
156
+ 'W': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
157
+ 'Y': {('CA', 'CB'): 0, ('CB', 'CG'): 1},
158
+ }
159
+
160
+ # index between 0 and 4 to retrieve chi angles, -1 means not a rotatable bond
161
+ aa_chi_indices = {
162
+ 'A': [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
163
+ 'C': [-1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1],
164
+ 'D': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
165
+ 'E': [-1, -1, -1, -1, -1, 0, 1, 2, 2, -1, -1, -1, -1, -1],
166
+ 'F': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
167
+ 'G': [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
168
+ 'H': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
169
+ 'I': [-1, -1, -1, -1, -1, 0, 0, 1, -1, -1, -1, -1, -1, -1],
170
+ 'K': [-1, -1, -1, -1, -1, 0, 1, 2, 3, -1, -1, -1, -1, -1],
171
+ 'L': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
172
+ 'M': [-1, -1, -1, -1, -1, 0, 1, 2, -1, -1, -1, -1, -1, -1],
173
+ 'N': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
174
+ 'P': [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
175
+ 'Q': [-1, -1, -1, -1, -1, 0, 1, 2, 2, -1, -1, -1, -1, -1],
176
+ 'R': [-1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 4, -1, -1, -1],
177
+ 'S': [-1, -1, -1, -1, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1],
178
+ 'T': [-1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -1],
179
+ 'V': [-1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, -1],
180
+ 'W': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
181
+ 'Y': [-1, -1, -1, -1, -1, 0, 1, 1, -1, -1, -1, -1, -1, -1],
182
+ }
183
+
184
+ # key: chi index (0=chi1, 1=chi2, ...); value: index of atom that defines the chi angle (together with its three predecessors)
185
+ aa_chi_anchor_atom = {
186
+ 'A': {},
187
+ 'C': {0: 5},
188
+ 'D': {0: 5, 1: 6},
189
+ 'E': {0: 5, 1: 6, 2: 7},
190
+ 'F': {0: 5, 1: 6},
191
+ 'G': {},
192
+ 'H': {0: 5, 1: 6},
193
+ 'I': {0: 5, 1: 7},
194
+ 'K': {0: 5, 1: 6, 2: 7, 3: 8},
195
+ 'L': {0: 5, 1: 6},
196
+ 'M': {0: 5, 1: 6, 2: 7},
197
+ 'N': {0: 5, 1: 6},
198
+ 'P': {},
199
+ 'Q': {0: 5, 1: 6, 2: 7},
200
+ 'R': {0: 5, 1: 6, 2: 7, 3: 8, 4: 9},
201
+ 'S': {0: 5},
202
+ 'T': {0: 5},
203
+ 'V': {0: 5},
204
+ 'W': {0: 5, 1: 6},
205
+ 'Y': {0: 5, 1: 6},
206
+ }
207
+
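The three chi tables are mutually consistent: the anchor atom of each rotatable bond must map back to that bond's chi index. A quick check, using lysine as an example:

```python
aa = 'K'
for bond, chi in aa_bond_to_chi[aa].items():
    anchor = aa_chi_anchor_atom[aa][chi]      # atom whose dihedral defines this chi
    assert aa_chi_indices[aa][anchor] == chi  # holds for all four lysine chi angles
```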
208
+ # ------------------------------------------------------------------------------
209
+ # Visualization
210
+ # ------------------------------------------------------------------------------
211
+ # PyMOL colors, see: https://pymolwiki.org/index.php/Color_Values#Chemical_element_colours
212
+ colors_dic = ['#33ff33', '#3333ff', '#ff4d4d', '#e6c540', '#ffb5b5', '#A62929', '#1FF01F', '#ff8000', '#940094', '#B3FFFF', '#b3e3f5']
213
+ radius_dic = [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]
214
+
215
+
216
+ # ------------------------------------------------------------------------------
217
+ # Backbone geometry
218
+ # Taken from: Bhagavan, N. V., and C. E. Ha.
219
+ # "Chapter 4-Three-dimensional structure of proteins and disorders of protein misfolding."
220
+ # Essentials of Medical Biochemistry (2015): 31-51.
221
+ # https://www.sciencedirect.com/science/article/pii/B978012416687500004X
222
+ # ------------------------------------------------------------------------------
223
+ N_CA_DIST = 1.47
224
+ CA_C_DIST = 1.53
225
+ N_CA_C_ANGLE = 110 * np.pi / 180
226
+
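From these idealized parameters, the fixed intra-residue N...C distance follows by the law of cosines (a worked sanity check, not used by the code itself):

```python
import numpy as np

n_c_dist = np.sqrt(N_CA_DIST ** 2 + CA_C_DIST ** 2
                   - 2 * N_CA_DIST * CA_C_DIST * np.cos(N_CA_C_ANGLE))
print(round(n_c_dist, 2))  # ~2.46 Angstrom
```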
227
+ # ------------------------------------------------------------------------------
228
+ # Atom radii
229
+ # ------------------------------------------------------------------------------
230
+ # # https://en.wikipedia.org/wiki/Covalent_radius#Radii_for_multiple_bonds
231
+ # # (2023/04/14)
232
+ # covalent_radii = {'H': [32, None, None],
233
+ # 'C': [75, 67, 60],
234
+ # 'N': [71, 60, 54],
235
+ # 'O': [63, 57, 53],
236
+ # 'F': [64, 59, 53],
237
+ # 'B': [85, 78, 73],
238
+ # 'Al': [126, 113, 111],
239
+ # 'Si': [116, 107, 102],
240
+ # 'P': [111, 102, 94],
241
+ # 'S': [103, 94, 95],
242
+ # 'Cl': [99, 95, 93],
243
+ # 'As': [121, 114, 106],
244
+ # 'Br': [114, 109, 110],
245
+ # 'I': [133, 129, 125],
246
+ # 'Hg': [133, 142, None],
247
+ # 'Bi': [151, 141, 135]}
248
+
249
+ # source: https://en.wikipedia.org/wiki/Van_der_Waals_radius
250
+ vdw_radii = {'N': 1.55, 'O': 1.52, 'C': 1.70, 'H': 1.10, 'S': 1.80, 'P': 1.80,
251
+ 'Se': 1.90, 'K': 2.75, 'Na': 2.27, 'Mg': 1.73, 'Zn': 1.39, 'B': 1.92,
252
+ 'Br': 1.85, 'Cl': 1.75, 'I': 1.98, 'F': 1.47}
253
+
254
+
255
+ WEBDATASET_SHARD_SIZE = 50000
256
+ WEBDATASET_VAL_SIZE = 100
src/data/data_utils.py ADDED
@@ -0,0 +1,901 @@
1
+ import io
2
+ from itertools import accumulate, chain
3
+ from copy import deepcopy
4
+ import random
5
+ import torch
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+ from rdkit import Chem
9
+ from torch_scatter import scatter_mean
10
+ from Bio.PDB import StructureBuilder, Chain, Model, Structure
11
+ from Bio.PDB.PICIO import read_PIC, write_PIC
12
+ from scipy.ndimage import gaussian_filter
13
+ from pdb import set_trace
14
+
15
+ from src.constants import FLOAT_TYPE, INT_TYPE
16
+ from src.constants import atom_encoder, bond_encoder, aa_encoder, residue_encoder, residue_bond_encoder, aa_atom_index
17
+ from src import utils
18
+ from src.data.misc import protein_letters_3to1, is_aa
19
+ from src.data.normal_modes import pdb_to_normal_modes
20
+ from src.data.nerf import get_nerf_params, ic_to_coords
21
+ import src.data.so3_utils as so3
22
+
23
+
24
+ class TensorDict(dict):
25
+ def __init__(self, **kwargs):
26
+ super(TensorDict, self).__init__(**kwargs)
27
+
28
+ def _apply(self, func: str, *args, **kwargs):
29
+ """ Apply function to all tensors. """
30
+ for k, v in self.items():
31
+ if torch.is_tensor(v):
32
+ self[k] = getattr(v, func)(*args, **kwargs)
33
+ return self
34
+
35
+ # def to(self, device):
36
+ # for k, v in self.items():
37
+ # if torch.is_tensor(v):
38
+ # self[k] = v.to(device)
39
+ # return self
40
+
41
+ def cuda(self):
42
+ return self.to('cuda')
43
+
44
+ def cpu(self):
45
+ return self.to('cpu')
46
+
47
+ def to(self, device):
48
+ return self._apply("to", device)
49
+
50
+ def detach(self):
51
+ return self._apply("detach")
52
+
53
+ def __repr__(self):
54
+ def val_to_str(val):
55
+ if isinstance(val, torch.Tensor):
56
+ # if val.isnan().any():
57
+ # return "(!nan)"
58
+ return "%r" % list(val.size())
59
+ if isinstance(val, list):
60
+ return "[%r,]" % len(val)
61
+ else:
62
+ return "?"
63
+
64
+ return f"{type(self).__name__}({', '.join(f'{k}={val_to_str(v)}' for k, v in self.items())})"
65
+
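A minimal sketch of how `TensorDict` behaves: tensor operations are broadcast over all tensor values, while non-tensor values pass through untouched:

```python
import torch

d = TensorDict(x=torch.randn(5, 3),
               mask=torch.zeros(5, dtype=torch.int64),
               name=['mol_0'])
d = d.detach().to('cpu')  # applied to 'x' and 'mask' only
print(d)                  # TensorDict(x=[5, 3], mask=[5], name=[1,])
```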
66
+
67
+ def collate_entity(batch):
68
+
69
+ out = {}
70
+ for prop in batch[0].keys():
71
+
72
+ if prop == 'name':
73
+ out[prop] = [x[prop] for x in batch]
74
+
75
+ elif prop == 'size' or prop == 'n_bonds':
76
+ out[prop] = torch.tensor([x[prop] for x in batch])
77
+
78
+ elif prop == 'bonds':
79
+ # index offset
80
+ offset = list(accumulate([x['size'] for x in batch], initial=0))
81
+ out[prop] = torch.cat([x[prop] + offset[i] for i, x in enumerate(batch)], dim=1)
82
+
83
+ elif prop == 'residues':
84
+ out[prop] = list(chain.from_iterable(x[prop] for x in batch))
85
+
86
+ elif prop in {'mask', 'bond_mask'}:
87
+ pass # batch masks will be written later
88
+
89
+ else:
90
+ out[prop] = torch.cat([x[prop] for x in batch], dim=0)
91
+
92
+ # Create batch masks
93
+ # make sure indices in batch start at zero (needed for torch_scatter)
94
+ if prop == 'x':
95
+ out['mask'] = torch.cat([i * torch.ones(len(x[prop]), dtype=torch.int64, device=x[prop].device)
96
+ for i, x in enumerate(batch)], dim=0)
97
+ if prop == 'bond_one_hot':
98
+ # TODO: this is not necessary as it can be computed on-the-fly as bond_mask = mask[bonds[0]] or bond_mask = mask[bonds[1]]
99
+ out['bond_mask'] = torch.cat([i * torch.ones(len(x[prop]), dtype=torch.int64, device=x[prop].device)
100
+ for i, x in enumerate(batch)], dim=0)
101
+
102
+ return out
103
+
104
+
105
+ def split_entity(
106
+ batch,
107
+ *,
108
+ index_types={'bonds'},
109
+ edge_types={'bond_one_hot', 'bond_mask'},
110
+ no_split={'name', 'size', 'n_bonds'},
111
+ skip={'fragments'},
112
+ batch_mask=None,
113
+ edge_mask=None
114
+ ):
115
+ """ Splits a batch into items and returns a list. """
116
+
117
+ batch_mask = batch["mask"] if batch_mask is None else batch_mask
118
+ edge_mask = batch["bond_mask"] if edge_mask is None else edge_mask
119
+ sizes = batch['size'] if 'size' in batch else torch.unique(batch_mask, return_counts=True)[1].tolist()
120
+
121
+ batch_size = len(torch.unique(batch['mask']))
122
+ out = {}
123
+ for prop in batch.keys():
124
+ if prop in skip:
125
+ continue
126
+ if prop in no_split:
127
+ out[prop] = batch[prop] # already a list
128
+
129
+ elif prop in index_types:
130
+ offsets = list(accumulate(sizes[:-1], initial=0))
131
+ out[prop] = utils.batch_to_list_for_indices(batch[prop], edge_mask, offsets)
132
+
133
+ elif prop in edge_types:
134
+ out[prop] = utils.batch_to_list(batch[prop], edge_mask)
135
+
136
+ else:
137
+ out[prop] = utils.batch_to_list(batch[prop], batch_mask)
138
+
139
+ out = [{k: v[i] for k, v in out.items()} for i in range(batch_size)]
140
+ return out
141
+
142
+
143
+ def repeat_items(batch, repeats):
144
+ batch_list = split_entity(batch)
145
+ out = collate_entity([x for _ in range(repeats) for x in batch_list])
146
+ return type(batch)(**out)
147
+
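`collate_entity`, `split_entity`, and `repeat_items` are designed to round-trip: bond indices are offset when merging and shifted back when splitting (the offset removal happens in the unshown `utils` helpers). A sketch, assuming `ligand_a` and `ligand_b` are dicts as returned by `prepare_ligand` below:

```python
batch = collate_entity([ligand_a, ligand_b])  # concatenates tensors, offsets bond indices
items = split_entity(batch)                   # inverse: list of per-item dicts again
doubled = repeat_items(batch, repeats=2)      # 4-item batch (a, b, a, b)
```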
148
+
149
+ def get_side_chain_bead_coord(biopython_residue):
150
+ """
151
+ Places side chain bead at the location of the farthest side chain atom.
152
+ """
153
+ if biopython_residue.get_resname() == 'GLY':
154
+ return None
155
+ if biopython_residue.get_resname() == 'ALA':
156
+ return biopython_residue['CB'].get_coord()
157
+
158
+ ca_coord = biopython_residue['CA'].get_coord()
159
+ side_chain_atoms = [a for a in biopython_residue.get_atoms() if
160
+ a.id not in {'N', 'CA', 'C', 'O'} and a.element != 'H']
161
+ side_chain_coords = np.stack([a.get_coord() for a in side_chain_atoms])
162
+
163
+ atom_idx = np.argmax(np.sum((side_chain_coords - ca_coord[None, :]) ** 2, axis=-1))
164
+
165
+ return side_chain_coords[atom_idx, :]
166
+
167
+
168
+ def get_side_chain_vectors(res, index_dict, size=None):
169
+ if size is None:
170
+ size = max([x for aa in index_dict.values() for x in aa.values()]) + 1
171
+
172
+ resname = protein_letters_3to1[res.get_resname()]
173
+
174
+ out = np.zeros((size, 3))
175
+ for atom in res.get_atoms():
176
+ if atom.get_name() in index_dict[resname]:
177
+ idx = index_dict[resname][atom.get_name()]
178
+ out[idx] = atom.get_coord() - res['CA'].get_coord()
179
+ # else:
180
+ # if atom.get_name() != 'CA' and not atom.get_name().startswith('H'):
181
+ # print(resname, atom.get_name())
182
+
183
+ return out
184
+
185
+
186
+ def get_normal_modes(res, normal_mode_dict):
187
+ nm = normal_mode_dict[(res.get_parent().id, res.id[1], 'CA')] # (n_modes, 3)
188
+ return nm
189
+
190
+
191
+ def get_torsion_angles(res, device=None):
192
+ """
193
+ Return the five chi angles. Missing angles are filled with NaN.
194
+ """
195
+ ANGLES = ['chi1', 'chi2', 'chi3', 'chi4', 'chi5']
196
+
197
+ ic_res = res.internal_coord
198
+ chi_angles = [ic_res.get_angle(chi) for chi in ANGLES]
199
+ chi_angles = [chi if chi is not None else float('nan') for chi in chi_angles]
200
+
201
+ return torch.tensor(chi_angles, device=device) * np.pi / 180
202
+
203
+
204
+ def apply_torsion_angles(res, chi_angles):
205
+ """
206
+ Set side chain torsion angles of a biopython residue object with
207
+ internal coordinates.
208
+ """
209
+ ANGLES = ['chi1', 'chi2', 'chi3', 'chi4', 'chi5']
210
+
211
+ chi_angles = chi_angles * 180 / np.pi
212
+
213
+ # res.parent.internal_coord.build_atomArray() # rebuild atom pointers
214
+
215
+ ic_res = res.internal_coord
216
+ for chi, angle in zip(ANGLES, chi_angles):
217
+ if ic_res.pick_angle(chi) is None:
218
+ continue
219
+ ic_res.bond_set(chi, angle)
220
+
221
+ res.parent.internal_to_atom_coordinates(verbose=False)
222
+ # res.parent.internal_coord.init_atom_coords()
223
+ # res.internal_coord.assemble()
224
+
225
+ return res
226
+
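Round-trip sketch for the two functions above; `res` is assumed to be a Bio.PDB residue whose chain already has internal coordinates computed (`chain.atom_to_internal_coordinates()`):

```python
import numpy as np

chi = get_torsion_angles(res)                     # radians; undefined angles come back as NaN
res = apply_torsion_angles(res, chi + np.pi / 6)  # rotate every defined chi by 30 degrees
```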
227
+
228
+ def prepare_internal_coord(res):
229
+
230
+ # Make new structure with a single residue
231
+ new_struct = Structure.Structure('X')
232
+ new_struct.header = {}
233
+ new_model = Model.Model(0)
234
+ new_struct.add(new_model)
235
+ new_chain = Chain.Chain('X')
236
+ new_model.add(new_chain)
237
+ new_chain.add(res)
238
+ res.set_parent(new_chain) # update pointer
239
+
240
+ # Compute internal coordinates
241
+ new_chain.atom_to_internal_coordinates()
242
+
243
+ pic_io = io.StringIO()
244
+ write_PIC(new_struct, pic_io)
245
+ return pic_io.getvalue()
246
+
247
+
248
+ def residue_from_internal_coord(ic_string):
249
+ pic_io = io.StringIO(ic_string)
250
+ struct = read_PIC(pic_io, quick=True)
251
+ res = struct.child_list[0].child_list[0].child_list[0]
252
+ res.parent.internal_to_atom_coordinates(verbose=False)
253
+ return res
254
+
255
+
256
+ def prepare_pocket(biopython_residues, amino_acid_encoder, residue_encoder,
257
+ residue_bond_encoder, pocket_representation='side_chain_bead',
258
+ compute_nerf_params=False, compute_bb_frames=False,
259
+ nma_input=None):
260
+
261
+ assert nma_input is None or pocket_representation == 'CA+', \
262
+ "vector features are only supported for CA+ pockets"
263
+
264
+ # sort residues
265
+ biopython_residues = sorted(biopython_residues, key=lambda x: (x.parent.id, x.id[1]))
266
+
267
+ if nma_input is not None:
268
+ # preprocessed normal mode eigenvectors
269
+ if isinstance(nma_input, dict):
270
+ nma_dict = nma_input
271
+
272
+ # PDB file
273
+ else:
274
+ nma_dict = pdb_to_normal_modes(str(nma_input))
275
+
276
+ if pocket_representation == 'side_chain_bead':
277
+ ca_coords = np.zeros((len(biopython_residues), 3))
278
+ ca_types = np.zeros(len(biopython_residues), dtype='int64')
279
+ side_chain_coords = []
280
+ side_chain_aa_types = []
281
+ edges = [] # CA-CA and CA-side_chain
282
+ edge_types = []
283
+ last_res_id = None
284
+ for i, res in enumerate(biopython_residues):
285
+ aa = amino_acid_encoder[protein_letters_3to1[res.get_resname()]]
286
+ ca_coords[i, :] = res['CA'].get_coord()
287
+ ca_types[i] = aa
288
+ side_chain_coord = get_side_chain_bead_coord(res)
289
+ if side_chain_coord is not None:
290
+ side_chain_coords.append(side_chain_coord)
291
+ side_chain_aa_types.append(aa)
292
+ edges.append((i, len(ca_coords) + len(side_chain_coords) - 1))
293
+ edge_types.append(residue_bond_encoder['CA-SS'])
294
+
295
+ # add edges between contiguous CA atoms
296
+ if i > 0 and res.id[1] == last_res_id + 1:
297
+ edges.append((i - 1, i))
298
+ edge_types.append(residue_bond_encoder['CA-CA'])
299
+
300
+ last_res_id = res.id[1]
301
+
302
+ # Coordinates
303
+ side_chain_coords = np.stack(side_chain_coords)
304
+ pocket_coords = np.concatenate([ca_coords, side_chain_coords], axis=0)
305
+ pocket_coords = torch.from_numpy(pocket_coords)
306
+
307
+ # Features
308
+ amino_acid_onehot = F.one_hot(
309
+ torch.cat([torch.from_numpy(ca_types), torch.tensor(side_chain_aa_types, dtype=torch.int64)], dim=0),
310
+ num_classes=len(amino_acid_encoder)
311
+ )
312
+ side_chain_onehot = np.concatenate([
313
+ np.tile(np.eye(1, len(residue_encoder), residue_encoder['CA']),
314
+ [len(ca_coords), 1]),
315
+ np.tile(np.eye(1, len(residue_encoder), residue_encoder['SS']),
316
+ [len(side_chain_coords), 1])
317
+ ], axis=0)
318
+ side_chain_onehot = torch.from_numpy(side_chain_onehot)
319
+ pocket_onehot = torch.cat([amino_acid_onehot, side_chain_onehot], dim=1)
320
+
321
+ vector_features = None
322
+ nma_features = None
323
+
324
+ # Bonds
325
+ edges = torch.tensor(edges).T
326
+ edge_types = F.one_hot(torch.tensor(edge_types), num_classes=len(residue_bond_encoder))
327
+
328
+ elif pocket_representation == 'CA+':
329
+ ca_coords = np.zeros((len(biopython_residues), 3))
330
+ ca_types = np.zeros(len(biopython_residues), dtype='int64')
331
+
332
+ v_dim = max([x for aa in aa_atom_index.values() for x in aa.values()]) + 1
333
+ vec_feats = np.zeros((len(biopython_residues), v_dim, 3), dtype='float32')
334
+ nf_nma = 5
335
+ nma_feats = np.zeros((len(biopython_residues), nf_nma, 3), dtype='float32')
336
+
337
+ edges = [] # CA-CA and CA-side_chain
338
+ edge_types = []
339
+ last_res_id = None
340
+ for i, res in enumerate(biopython_residues):
341
+ aa = amino_acid_encoder[protein_letters_3to1[res.get_resname()]]
342
+ ca_coords[i, :] = res['CA'].get_coord()
343
+ ca_types[i] = aa
344
+
345
+ vec_feats[i] = get_side_chain_vectors(res, aa_atom_index, v_dim)
346
+ if nma_input is not None:
347
+ nma_feats[i] = get_normal_modes(res, nma_dict)
348
+
349
+ # add edges between contiguous CA atoms
350
+ if i > 0 and res.id[1] == last_res_id + 1:
351
+ edges.append((i - 1, i))
352
+ edge_types.append(residue_bond_encoder['CA-CA'])
353
+
354
+ last_res_id = res.id[1]
355
+
356
+ # Coordinates
357
+ pocket_coords = torch.from_numpy(ca_coords)
358
+
359
+ # Features
360
+ pocket_onehot = F.one_hot(torch.from_numpy(ca_types),
361
+ num_classes=len(amino_acid_encoder))
362
+
363
+ vector_features = torch.from_numpy(vec_feats)
364
+ nma_features = torch.from_numpy(nma_feats)
365
+
366
+ # Bonds
367
+ if len(edges) < 1:
368
+ edges = torch.empty(2, 0)
369
+ edge_types = torch.empty(0, len(residue_bond_encoder))
370
+ else:
371
+ edges = torch.tensor(edges).T
372
+ edge_types = F.one_hot(torch.tensor(edge_types),
373
+ num_classes=len(residue_bond_encoder))
374
+
375
+ else:
376
+ raise NotImplementedError(
377
+ f"Pocket representation '{pocket_representation}' not implemented")
378
+
379
+ # pocket_ids = [f'{res.parent.id}:{res.id[1]}' for res in biopython_residues]
380
+
381
+ pocket = {
382
+ 'x': pocket_coords.to(dtype=FLOAT_TYPE),
383
+ 'one_hot': pocket_onehot.to(dtype=FLOAT_TYPE),
384
+ # 'ids': pocket_ids,
385
+ 'size': torch.tensor([len(pocket_coords)], dtype=INT_TYPE),
386
+ 'mask': torch.zeros(len(pocket_coords), dtype=INT_TYPE),
387
+ 'bonds': edges.to(INT_TYPE),
388
+ 'bond_one_hot': edge_types.to(FLOAT_TYPE),
389
+ 'bond_mask': torch.zeros(edges.size(1), dtype=INT_TYPE),
390
+ 'n_bonds': torch.tensor([len(edge_types)], dtype=INT_TYPE),
391
+ }
392
+
393
+ if vector_features is not None:
394
+ pocket['v'] = vector_features.to(dtype=FLOAT_TYPE)
395
+
396
+ if nma_input is not None:
397
+ pocket['nma_vec'] = nma_features.to(dtype=FLOAT_TYPE)
398
+
399
+ if compute_nerf_params:
400
+ nerf_params = [get_nerf_params(r) for r in biopython_residues]
401
+ nerf_params = {k: torch.stack([x[k] for x in nerf_params], dim=0)
402
+ for k in nerf_params[0].keys()}
403
+ pocket.update(nerf_params)
404
+
405
+ if compute_bb_frames:
406
+ n_xyz = torch.from_numpy(np.stack([r['N'].get_coord() for r in biopython_residues]))
407
+ ca_xyz = torch.from_numpy(np.stack([r['CA'].get_coord() for r in biopython_residues]))
408
+ c_xyz = torch.from_numpy(np.stack([r['C'].get_coord() for r in biopython_residues]))
409
+ pocket['axis_angle'], _ = get_bb_transform(n_xyz, ca_xyz, c_xyz)
410
+
411
+ return pocket, biopython_residues
412
+
413
+
414
+ def encode_atom(rd_atom, atom_encoder):
415
+ element = rd_atom.GetSymbol().capitalize()
416
+
417
+ explicitHs = rd_atom.GetNumExplicitHs()
418
+ if explicitHs == 1 and f'{element}H' in atom_encoder:
419
+ return atom_encoder[f'{element}H']
420
+
421
+ charge = rd_atom.GetFormalCharge()
422
+ if charge == 1 and f'{element}+' in atom_encoder:
423
+ return atom_encoder[f'{element}+']
424
+ if charge == -1 and f'{element}-' in atom_encoder:
425
+ return atom_encoder[f'{element}-']
426
+
427
+ return atom_encoder[element]
428
+
429
+
430
+ def prepare_ligand(rdmol, atom_encoder, bond_encoder):
431
+
432
+ # remove H atoms if not in atom_encoder
433
+ if 'H' not in atom_encoder:
434
+ rdmol = Chem.RemoveAllHs(rdmol, sanitize=False)
435
+
436
+ # Coordinates
437
+ ligand_coord = rdmol.GetConformer().GetPositions()
438
+ ligand_coord = torch.from_numpy(ligand_coord)
439
+
440
+ # Features
441
+ ligand_onehot = F.one_hot(
442
+ torch.tensor([encode_atom(a, atom_encoder) for a in rdmol.GetAtoms()]),
443
+ num_classes=len(atom_encoder)
444
+ )
445
+
446
+ # Bonds
447
+ adj = np.ones((rdmol.GetNumAtoms(), rdmol.GetNumAtoms())) * bond_encoder['NOBOND']
448
+ for b in rdmol.GetBonds():
449
+ i = b.GetBeginAtomIdx()
450
+ j = b.GetEndAtomIdx()
451
+ adj[i, j] = bond_encoder[str(b.GetBondType())]
452
+ adj[j, i] = adj[i, j] # undirected graph
453
+
454
+ # molecular graph is undirected -> don't save redundant information
455
+ bonds = np.stack(np.triu_indices(len(ligand_coord), k=1), axis=0)
456
+ # bonds = np.stack(np.ones_like(adj).nonzero(), axis=0)
457
+ bond_types = adj[bonds[0], bonds[1]].astype('int64')
458
+ bonds = torch.from_numpy(bonds)
459
+ bond_types = F.one_hot(torch.from_numpy(bond_types), num_classes=len(bond_encoder))
460
+
461
+ ligand = {
462
+ 'x': ligand_coord.to(dtype=FLOAT_TYPE),
463
+ 'one_hot': ligand_onehot.to(dtype=FLOAT_TYPE),
464
+ 'mask': torch.zeros(len(ligand_coord), dtype=INT_TYPE),
465
+ 'bonds': bonds.to(INT_TYPE),
466
+ 'bond_one_hot': bond_types.to(FLOAT_TYPE),
467
+ 'bond_mask': torch.zeros(bonds.size(1), dtype=INT_TYPE),
468
+ 'size': torch.tensor([len(ligand_coord)], dtype=INT_TYPE),
469
+ 'n_bonds': torch.tensor([len(bond_types)], dtype=INT_TYPE),
470
+ }
471
+
472
+ return ligand
473
+
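Usage sketch for `prepare_ligand` (the SDF path points at the example ligand shipped with the repo):

```python
from rdkit import Chem

rdmol = Chem.SDMolSupplier('examples/kras_ref_ligand.sdf', sanitize=False)[0]
ligand = prepare_ligand(rdmol, atom_encoder, bond_encoder)
print(ligand['x'].shape,        # (n_atoms, 3)
      ligand['one_hot'].shape,  # (n_atoms, len(atom_encoder))
      ligand['bonds'].shape)    # (2, n_atoms * (n_atoms - 1) // 2), upper triangle
```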
474
+
475
+ def process_raw_molecule_with_empty_pocket(rdmol):
476
+ ligand = prepare_ligand(rdmol, atom_encoder, bond_encoder)
477
+ pocket = {
478
+ 'x': torch.tensor([], dtype=FLOAT_TYPE),
479
+ 'one_hot': torch.tensor([], dtype=FLOAT_TYPE),
480
+ 'size': torch.tensor([], dtype=INT_TYPE),
481
+ 'mask': torch.tensor([], dtype=INT_TYPE),
482
+ 'bonds': torch.tensor([], dtype=INT_TYPE),
483
+ 'bond_one_hot': torch.tensor([], dtype=FLOAT_TYPE),
484
+ 'bond_mask': torch.tensor([], dtype=INT_TYPE),
485
+ 'n_bonds': torch.tensor([], dtype=INT_TYPE),
486
+ }
487
+ return ligand, pocket
488
+
489
+
490
+ def process_raw_pair(biopython_model, rdmol, dist_cutoff=None,
491
+ pocket_representation='side_chain_bead',
492
+ compute_nerf_params=False, compute_bb_frames=False,
493
+ nma_input=None, return_pocket_pdb=False):
494
+
495
+ # Process ligand
496
+ ligand = prepare_ligand(rdmol, atom_encoder, bond_encoder)
497
+
498
+ # Find interacting pocket residues based on distance cutoff
499
+ pocket_residues = []
500
+ for residue in biopython_model.get_residues():
501
+
502
+ # Remove non-standard amino acids and HETATMs
503
+ if not is_aa(residue.get_resname(), standard=True):
504
+ continue
505
+
506
+ res_coords = torch.from_numpy(np.array([a.get_coord() for a in residue.get_atoms()]))
507
+ if dist_cutoff is None or (((res_coords[:, None, :] - ligand['x'][None, :, :]) ** 2).sum(-1) ** 0.5).min() < dist_cutoff:
508
+ pocket_residues.append(residue)
509
+
510
+ pocket, pocket_residues = prepare_pocket(
511
+ pocket_residues, aa_encoder, residue_encoder, residue_bond_encoder,
512
+ pocket_representation, compute_nerf_params, compute_bb_frames, nma_input
513
+ )
514
+
515
+ if return_pocket_pdb:
516
+ builder = StructureBuilder.StructureBuilder()
517
+ builder.init_structure("")
518
+ builder.init_model(0)
519
+ pocket_struct = builder.get_structure()
520
+ for residue in pocket_residues:
521
+ chain = residue.get_parent().get_id()
522
+
523
+ # init chain if necessary
524
+ if not pocket_struct[0].has_id(chain):
525
+ builder.init_chain(chain)
526
+
527
+ # add residue
528
+ pocket_struct[0][chain].add(residue)
529
+
530
+ pocket['pocket_pdb'] = pocket_struct
531
+ # if return_pocket_pdb:
532
+ # pocket['residues'] = [prepare_internal_coord(res) for res in pocket_residues]
533
+
534
+ return ligand, pocket
535
+
536
+
537
+ class AppendVirtualNodes:
538
+ def __init__(self, atom_encoder, bond_encoder, max_ligand_size, scale=1.0):
539
+ self.max_size = max_ligand_size
540
+ self.atom_encoder = atom_encoder
541
+ self.bond_encoder = bond_encoder
542
+ self.vidx = atom_encoder['NOATOM']
543
+ self.bidx = bond_encoder['NOBOND']
544
+ self.scale = scale
545
+
546
+ def __call__(self, ligand, max_size=None, eps=1e-6):
547
+ if max_size is None:
548
+ max_size = self.max_size
549
+
550
+ n_virt = max_size - ligand['size']
551
+
552
+ C = torch.cov(ligand['x'].T)
553
+ L = torch.linalg.cholesky(C + torch.eye(3) * eps)
554
+ mu = ligand['x'].mean(0, keepdim=True)
555
+ virt_coords = mu + torch.randn(n_virt, 3) @ L.T * self.scale
556
+
557
+ # insert virtual atom column
558
+ virt_one_hot = F.one_hot(torch.ones(n_virt, dtype=torch.int64) * self.vidx, num_classes=len(self.atom_encoder))
559
+ virt_mask = torch.cat([torch.zeros(ligand['size'], dtype=bool), torch.ones(n_virt, dtype=bool)])
560
+
561
+ ligand['x'] = torch.cat([ligand['x'], virt_coords])
562
+ ligand['one_hot'] = torch.cat(([ligand['one_hot'], virt_one_hot]))
563
+ ligand['virtual_mask'] = virt_mask
564
+ ligand['size'] = max_size
565
+
566
+ # Bonds
567
+ new_bonds = torch.triu_indices(max_size, max_size, offset=1)
568
+
569
+ bond_types = torch.ones(max_size, max_size, dtype=INT_TYPE) * self.bidx
570
+ row, col = ligand['bonds']
571
+ bond_types[row, col] = ligand['bond_one_hot'].argmax(dim=1)
572
+ new_row, new_col = new_bonds
573
+ bond_types = bond_types[new_row, new_col]
574
+
575
+ ligand['bonds'] = new_bonds
576
+ ligand['bond_one_hot'] = F.one_hot(bond_types, num_classes=len(self.bond_encoder)).to(ligand['bond_one_hot'].dtype)
577
+ ligand['n_bonds'] = len(ligand['bond_one_hot'])
578
+
579
+ return ligand
580
+
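The coordinate sampling in `AppendVirtualNodes.__call__` fits an anisotropic Gaussian to the ligand point cloud (mean plus Cholesky factor of the covariance). A standalone sketch of just that step, on toy coordinates:

```python
import torch

x = torch.randn(20, 3) * torch.tensor([3.0, 1.0, 0.5])  # toy "ligand" coordinates
mu = x.mean(0, keepdim=True)
L = torch.linalg.cholesky(torch.cov(x.T) + 1e-6 * torch.eye(3))
virt = mu + torch.randn(8, 3) @ L.T  # samples follow the cloud's spread and orientation
```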
581
+
582
+ class AppendVirtualNodesInCoM:
583
+ def __init__(self, atom_encoder, bond_encoder, add_min=0, add_max=10):
584
+ self.atom_encoder = atom_encoder
585
+ self.bond_encoder = bond_encoder
586
+ self.vidx = atom_encoder['NOATOM']
587
+ self.bidx = bond_encoder['NOBOND']
588
+ self.add_min = add_min
589
+ self.add_max = add_max
590
+
591
+ def __call__(self, ligand):
592
+
593
+ n_virt = random.randint(self.add_min, self.add_max)
594
+
595
+ # all virtual coordinates in the CoM
596
+ virt_coords = ligand['x'].mean(0, keepdim=True).repeat(n_virt, 1)
597
+
598
+ # insert virtual atom column
599
+ virt_one_hot = F.one_hot(torch.ones(n_virt, dtype=torch.int64) * self.vidx, num_classes=len(self.atom_encoder))
600
+ virt_mask = torch.cat([torch.zeros(ligand['size'], dtype=bool), torch.ones(n_virt, dtype=bool)])
601
+
602
+ ligand['x'] = torch.cat([ligand['x'], virt_coords])
603
+ ligand['one_hot'] = torch.cat(([ligand['one_hot'], virt_one_hot]))
604
+ ligand['virtual_mask'] = virt_mask
605
+ ligand['size'] = len(ligand['x'])
606
+
607
+ # Bonds
608
+ new_bonds = torch.triu_indices(ligand['size'], ligand['size'], offset=1)
609
+
610
+ bond_types = torch.ones(ligand['size'], ligand['size'], dtype=INT_TYPE) * self.bidx
611
+ row, col = ligand['bonds']
612
+ bond_types[row, col] = ligand['bond_one_hot'].argmax(dim=1)
613
+ new_row, new_col = new_bonds
614
+ bond_types = bond_types[new_row, new_col]
615
+
616
+ ligand['bonds'] = new_bonds
617
+ ligand['bond_one_hot'] = F.one_hot(bond_types, num_classes=len(self.bond_encoder)).to(ligand['bond_one_hot'].dtype)
618
+ ligand['n_bonds'] = len(ligand['bond_one_hot'])
619
+
620
+ return ligand
621
+
622
+
623
+ def rdmol_to_smiles(rdmol):
624
+ mol = Chem.Mol(rdmol)
625
+ Chem.RemoveStereochemistry(mol)
626
+ mol = Chem.RemoveHs(mol)
627
+ return Chem.MolToSmiles(mol)
628
+
629
+
630
+ def get_n_nodes(lig_positions, pocket_positions, smooth_sigma=None):
631
+ # Joint distribution of ligand's and pocket's number of nodes
632
+ n_nodes_lig = [len(x) for x in lig_positions]
633
+ n_nodes_pocket = [len(x) for x in pocket_positions]
634
+
635
+ joint_histogram = np.zeros((np.max(n_nodes_lig) + 1,
636
+ np.max(n_nodes_pocket) + 1))
637
+
638
+ for nlig, npocket in zip(n_nodes_lig, n_nodes_pocket):
639
+ joint_histogram[nlig, npocket] += 1
640
+
641
+ print(f'Original histogram: {np.count_nonzero(joint_histogram)}/'
642
+ f'{joint_histogram.shape[0] * joint_histogram.shape[1]} bins filled')
643
+
644
+ # Smooth the histogram
645
+ if smooth_sigma is not None:
646
+ filtered_histogram = gaussian_filter(
647
+ joint_histogram, sigma=smooth_sigma, order=0, mode='constant',
648
+ cval=0.0, truncate=4.0)
649
+
650
+ print(f'Smoothed histogram: {np.count_nonzero(filtered_histogram)}/'
651
+ f'{filtered_histogram.shape[0] * filtered_histogram.shape[1]} bins filled')
652
+
653
+ joint_histogram = filtered_histogram
654
+
655
+ return joint_histogram
656
+
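At sampling time, such a joint histogram is typically conditioned on the observed pocket size. A hedged sketch of how it could be used (the actual sampling code lives elsewhere in the repo; this helper is illustrative):

```python
import numpy as np

def sample_ligand_size(joint_histogram, n_pocket_nodes, rng=None):
    rng = np.random.default_rng() if rng is None else rng
    col = joint_histogram[:, n_pocket_nodes]  # unnormalized p(n_lig | n_pocket)
    return int(rng.choice(len(col), p=col / col.sum()))
```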
657
+
658
+ # def get_type_histograms(lig_one_hot, pocket_one_hot, lig_encoder, pocket_encoder):
659
+ #
660
+ # lig_one_hot = np.concatenate(lig_one_hot, axis=0)
661
+ # pocket_one_hot = np.concatenate(pocket_one_hot, axis=0)
662
+ #
663
+ # atom_decoder = list(lig_encoder.keys())
664
+ # lig_counts = {k: 0 for k in lig_encoder.keys()}
665
+ # for a in [atom_decoder[x] for x in lig_one_hot.argmax(1)]:
666
+ # lig_counts[a] += 1
667
+ #
668
+ # aa_decoder = list(pocket_encoder.keys())
669
+ # pocket_counts = {k: 0 for k in pocket_encoder.keys()}
670
+ # for r in [aa_decoder[x] for x in pocket_one_hot.argmax(1)]:
671
+ # pocket_counts[r] += 1
672
+ #
673
+ # return lig_counts, pocket_counts
674
+
675
+
676
+ def get_type_histogram(one_hot, type_encoder):
677
+
678
+ one_hot = np.concatenate(one_hot, axis=0)
679
+
680
+ decoder = list(type_encoder.keys())
681
+ counts = {k: 0 for k in type_encoder.keys()}
682
+ for a in [decoder[x] for x in one_hot.argmax(1)]:
683
+ counts[a] += 1
684
+
685
+ return counts
686
+
687
+
688
+ def get_residue_with_resi(pdb_chain, resi):
689
+ res = [x for x in pdb_chain.get_residues() if x.id[1] == resi]
690
+ assert len(res) == 1
691
+ return res[0]
692
+
693
+
694
+ def get_pocket_from_ligand(pdb_model, ligand, dist_cutoff=8.0):
695
+
696
+ if ligand.endswith(".sdf"):
697
+ # ligand as sdf file
698
+ rdmol = Chem.SDMolSupplier(str(ligand))[0]
699
+ ligand_coords = torch.from_numpy(rdmol.GetConformer().GetPositions()).float()
700
+ resi = None
701
+ else:
702
+ # ligand contained in PDB; given in <chain>:<resi> format
703
+ chain, resi = ligand.split(':')
704
+ ligand = get_residue_with_resi(pdb_model[chain], int(resi))
705
+ ligand_coords = torch.from_numpy(
706
+ np.array([a.get_coord() for a in ligand.get_atoms()]))
707
+
708
+ pocket_residues = []
709
+ for residue in pdb_model.get_residues():
710
+ if resi is not None and residue.id[1] == int(resi):
711
+ continue # skip ligand itself
712
+
713
+ res_coords = torch.from_numpy(
714
+ np.array([a.get_coord() for a in residue.get_atoms()]))
715
+ if is_aa(residue.get_resname(), standard=True) \
716
+ and torch.cdist(res_coords, ligand_coords).min() < dist_cutoff:
717
+ pocket_residues.append(residue)
718
+
719
+ return pocket_residues
720
+
721
+
722
+ def encode_residues(biopython_residues, type_encoder, level='atom',
723
+ remove_H=True):
724
+ assert level in {'atom', 'residue'}
725
+
726
+ if level == 'atom':
727
+ entities = [a for res in biopython_residues for a in res.get_atoms()
728
+ if (a.element != 'H' or not remove_H)]
729
+ types = [a.element.capitalize() for a in entities]
730
+ else:
731
+ entities = [res['CA'] for res in biopython_residues]
732
+ types = [protein_letters_3to1[res.get_resname()] for res in biopython_residues]
733
+
734
+ coord = torch.tensor(np.stack([e.get_coord() for e in entities]))
735
+ one_hot = F.one_hot(torch.tensor([type_encoder[t] for t in types]),
736
+ num_classes=len(type_encoder))
737
+
738
+ return coord, one_hot
739
+
740
+
741
+ def center_data(ligand, pocket):
742
+ if pocket['x'].numel() > 0:
743
+ pocket_com = pocket.center()
744
+ else:
745
+ pocket_com = scatter_mean(ligand['x'], ligand['mask'], dim=0)
746
+
747
+ ligand['x'] = ligand['x'] - pocket_com[ligand['mask']]
748
+ return ligand, pocket
749
+
750
+
751
+ def get_bb_transform(n_xyz, ca_xyz, c_xyz):
752
+ """
753
+ Compute translation and rotation of the canonical backbone frame (triangle N-Ca-C) from a position with
754
+ Ca at the origin, N on the x-axis and C in the xy-plane to the global position of the backbone frame
755
+
756
+ Args:
757
+ n_xyz: (n, 3)
758
+ ca_xyz: (n, 3)
759
+ c_xyz: (n, 3)
760
+
761
+ Returns:
762
+ axis-angle representation of the rotation, shape (n, 3) # rotation matrix of shape (n, 3, 3)
763
+ translation vector of shape (n, 3)
764
+ """
765
+
766
+ def rotation_matrix(angle, axis):
767
+ axis_mapping = {'x': 0, 'y': 1, 'z': 2}
768
+ axis = axis_mapping[axis]
769
+ vector = torch.zeros(len(angle), 3)
770
+ vector[:, axis] = 1
771
+ # return axis_angle_to_matrix(angle * vector)
772
+ return so3.matrix_from_rotation_vector(angle.view(-1, 1) * vector)
773
+
774
+ translation = ca_xyz
775
+ n_xyz = n_xyz - translation
776
+ c_xyz = c_xyz - translation
777
+
778
+ # Find rotation matrix that aligns the coordinate systems
779
+
780
+ # rotate around y-axis to move N into the xy-plane
781
+ theta_y = torch.arctan2(n_xyz[:, 2], -n_xyz[:, 0])
782
+ Ry = rotation_matrix(theta_y, 'y')
783
+ Ry = Ry.transpose(2, 1)
784
+ n_xyz = torch.einsum('noi,ni->no', Ry, n_xyz)
785
+
786
+ # rotate around z-axis to move N onto the x-axis
787
+ theta_z = torch.arctan2(n_xyz[:, 1], n_xyz[:, 0])
788
+ Rz = rotation_matrix(theta_z, 'z')
789
+ Rz = Rz.transpose(2, 1)
790
+ # print(torch.einsum('noi,ni->no', Rz, n_xyz))
791
+
792
+ # n_xyz = torch.einsum('noi,ni->no', Rz.transpose(0, 2, 1), n_xyz)
793
+
794
+ # rotate around x-axis to move C into the xy-plane
795
+ c_xyz = torch.einsum('noj,nji,ni->no', Rz, Ry, c_xyz)
796
+ theta_x = torch.arctan2(c_xyz[:, 2], c_xyz[:, 1])
797
+ Rx = rotation_matrix(theta_x, 'x')
798
+ Rx = Rx.transpose(2, 1)
799
+ # print(torch.einsum('noi,ni->no', Rx, c_xyz))
800
+
801
+ # Final rotation matrix
802
+ Ry = Ry.transpose(2, 1)
803
+ Rz = Rz.transpose(2, 1)
804
+ Rx = Rx.transpose(2, 1)
805
+ R = torch.einsum('nok,nkj,nji->noi', Ry, Rz, Rx)
806
+
807
+ # return R, translation
808
+ # return matrix_to_axis_angle(R), translation
809
+ return so3.rotation_vector_from_matrix(R), translation
810
+
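A small sanity-check sketch for the frame computation above: a residue that already sits in the canonical frame (Ca at the origin, N on the x-axis, C in the xy-plane) should yield a near-zero rotation vector and a translation equal to the Ca position. The coordinates are illustrative:

import torch

n = torch.tensor([[1.46, 0.0, 0.0]])
ca = torch.zeros(1, 3)
c = torch.tensor([[-0.55, 1.42, 0.0]])
axis_angle, translation = get_bb_transform(n, ca, c)
# axis_angle ≈ (0, 0, 0), translation == ca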
811
+
812
+ class Residues(TensorDict):
813
+ """
814
+ Dictionary-like container for residues that supports some basic transformations.
815
+ """
816
+
817
+ # all keys
818
+ KEYS = {'x', 'one_hot', 'bonds', 'bond_one_hot', 'v', 'nma_vec', 'fixed_coord',
819
+ 'atom_mask', 'nerf_indices', 'length', 'theta', 'chi', 'ddihedral',
820
+ 'chi_indices', 'axis_angle', 'mask', 'bond_mask'}
821
+
822
+ # coordinate-type values, shape (..., 3)
823
+ COORD_KEYS = {'x', 'fixed_coord'}
824
+
825
+ # vector-type values, shape (n_residues, n_feat, 3)
826
+ VECTOR_KEYS = {'v', 'nma_vec'}
827
+
828
+ # properties that change if the side chains and/or backbones are updated
829
+ MUTABLE_PROPS_SS_AND_BB = {'v'}
830
+
831
+ # properties that only change if the side chains are updated
832
+ MUTABLE_PROPS_SS = {'chi'}
833
+
834
+ # properties that only change if the backbones are updated
835
+ MUTABLE_PROPS_BB = {'x', 'fixed_coord', 'axis_angle', 'nma_vec'}
836
+
837
+ # properties that remain fixed in all cases
838
+ IMMUTABLE_PROPS = {'mask', 'one_hot', 'bonds', 'bond_one_hot', 'bond_mask',
839
+ 'atom_mask', 'nerf_indices', 'length', 'theta',
840
+ 'ddihedral', 'chi_indices', 'name', 'size', 'n_bonds'}
841
+
842
+ def copy(self):
843
+ data = super().copy()
844
+ return Residues(**data)
845
+
846
+ def deepcopy(self):
847
+ data = {k: v.clone() if torch.is_tensor(v) else deepcopy(v)
848
+ for k, v in self.items()}
849
+ return Residues(**data)
850
+
851
+ def center(self):
852
+ com = scatter_mean(self['x'], self['mask'], dim=0)
853
+ self['x'] = self['x'] - com[self['mask']]
854
+ self['fixed_coord'] = self['fixed_coord'] - com[self['mask']].unsqueeze(1)
855
+ return com
856
+
857
+ def set_empty_v(self):
858
+ self['v'] = torch.tensor([], device=self['x'].device)
859
+
860
+ @torch.no_grad()
861
+ def set_chi(self, chi_angles):
862
+ self['chi'][:, :5] = chi_angles
863
+ nerf_params = {k: self[k] for k in ['fixed_coord', 'atom_mask',
864
+ 'nerf_indices', 'length', 'theta',
865
+ 'chi', 'ddihedral', 'chi_indices']}
866
+ self['v'] = ic_to_coords(**nerf_params) - self['x'].unsqueeze(1)
867
+
868
+ @torch.no_grad()
869
+ def set_frame(self, new_ca_coord, new_axis_angle):
870
+ bb_coord = self['fixed_coord']
871
+ bb_coord = bb_coord - self['x'].unsqueeze(1)
872
+ rotmat_before = so3.matrix_from_rotation_vector(self['axis_angle'])
873
+ rotmat_after = so3.matrix_from_rotation_vector(new_axis_angle)
874
+ rotmat_diff = rotmat_after @ rotmat_before.transpose(-1, -2)
875
+ bb_coord = torch.einsum('boi,bai->bao', rotmat_diff, bb_coord)
876
+ bb_coord = bb_coord + new_ca_coord.unsqueeze(1)
877
+
878
+ self['x'] = new_ca_coord
879
+ self['axis_angle'] = new_axis_angle
880
+ self['fixed_coord'] = bb_coord
881
+ self['v'] = torch.einsum('boi,bai->bao', rotmat_diff, self['v'])
882
+
883
+ @staticmethod
884
+ def empty(device):
885
+ return Residues(
886
+ x=torch.zeros(1, 3, device=device).float(),
887
+ mask=torch.zeros(1, 1, device=device).long(),
888
+ size=torch.zeros(1, device=device).long(),
889
+ )
890
+
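A minimal sketch of the container in use; real instances carry the full key set listed in KEYS, here only the keys touched by center() are populated:

import torch

res = Residues(
    x=torch.randn(4, 3),                    # CA coordinates
    mask=torch.zeros(4, dtype=torch.long),  # all residues belong to sample 0
    fixed_coord=torch.randn(4, 5, 3),       # N, CA, C, O, CB
)
com = res.center()                          # shifts 'x' and 'fixed_coord'
assert torch.allclose(res['x'].mean(0), torch.zeros(3), atol=1e-5)
backup = res.deepcopy()                     # independent tensor copies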
891
+
892
+ def randomize_tensors(tensor_dict, exclude_keys=None):
893
+ """Replace tensors with random tensors with the same shape."""
894
+ exclude_keys = set() if exclude_keys is None else set(exclude_keys)
895
+ for k, v in tensor_dict.items():
896
+ if isinstance(v, torch.Tensor) and k not in exclude_keys:
897
+ if torch.is_floating_point(v):
898
+ tensor_dict[k] = torch.randn_like(v)
899
+ else:
900
+ tensor_dict[k] = torch.randint_like(v, low=-42, high=42)
901
+ return tensor_dict
src/data/dataset.py ADDED
@@ -0,0 +1,208 @@
1
+ import io
2
+ import random
3
+ import warnings
4
+ import torch
5
+ import webdataset as wds
6
+
7
+ from pathlib import Path
8
+ from torch.utils.data import Dataset
9
+
10
+ from src.data.data_utils import TensorDict, collate_entity
11
+ from src.constants import WEBDATASET_SHARD_SIZE, WEBDATASET_VAL_SIZE
12
+
13
+
14
+ class ProcessedLigandPocketDataset(Dataset):
15
+ def __init__(self, pt_path, ligand_transform=None, pocket_transform=None,
16
+ catch_errors=False):
17
+
18
+ self.ligand_transform = ligand_transform
19
+ self.pocket_transform = pocket_transform
20
+ self.catch_errors = catch_errors
21
+ self.pt_path = pt_path
22
+
23
+ self.data = torch.load(pt_path)
24
+
25
+ # add number of nodes for convenience
26
+ for entity in ['ligands', 'pockets']:
27
+ self.data[entity]['size'] = torch.tensor([len(x) for x in self.data[entity]['x']])
28
+ self.data[entity]['n_bonds'] = torch.tensor([len(x) for x in self.data[entity]['bond_one_hot']])
29
+
30
+ def __len__(self):
31
+ return len(self.data['ligands']['name'])
32
+
33
+ def __getitem__(self, idx):
34
+ data = {}
35
+ data['ligand'] = {key: val[idx] for key, val in self.data['ligands'].items()}
36
+ data['pocket'] = {key: val[idx] for key, val in self.data['pockets'].items()}
37
+ try:
38
+ if self.ligand_transform is not None:
39
+ data['ligand'] = self.ligand_transform(data['ligand'])
40
+ if self.pocket_transform is not None:
41
+ data['pocket'] = self.pocket_transform(data['pocket'])
42
+ except (RuntimeError, ValueError) as e:
43
+ if self.catch_errors:
44
+ warnings.warn(f"{type(e).__name__}('{e}') in data transform. "
45
+ f"Returning random item instead")
46
+ # replace bad item with a random one
47
+ rand_idx = random.randint(0, len(self) - 1)
48
+ return self[rand_idx]
49
+ else:
50
+ raise e
51
+ return data
52
+
53
+ @staticmethod
54
+ def collate_fn(batch_pairs, ligand_transform=None):
55
+
56
+ out = {}
57
+ for entity in ['ligand', 'pocket']:
58
+ batch = [x[entity] for x in batch_pairs]
59
+
60
+ if entity == 'ligand' and ligand_transform is not None:
61
+ max_size = max(x['size'].item() for x in batch)
62
+ # TODO: might have to remove elements from batch if processing fails, warn user in that case
63
+ batch = [ligand_transform(x, max_size=max_size) for x in batch]
64
+
65
+ out[entity] = TensorDict(**collate_entity(batch))
66
+
67
+ return out
68
+
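A hedged sketch of wiring the dataset and its collate_fn into a PyTorch DataLoader; the .pt path is a placeholder for a processed split:

from functools import partial
from torch.utils.data import DataLoader

dataset = ProcessedLigandPocketDataset('processed/train.pt', catch_errors=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True,
                    collate_fn=partial(dataset.collate_fn, ligand_transform=None))
batch = next(iter(loader))  # batch['ligand'] / batch['pocket'] are TensorDicts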
69
+
70
+ class ClusteredDataset(ProcessedLigandPocketDataset):
71
+ def __init__(self, pt_path, ligand_transform=None, pocket_transform=None,
72
+ catch_errors=False):
73
+ super().__init__(pt_path, ligand_transform, pocket_transform, catch_errors)
74
+ self.clusters = list(self.data['clusters'].values())
75
+
76
+ def __len__(self):
77
+ return len(self.clusters)
78
+
79
+ def __getitem__(self, cidx):
80
+ cluster_inds = self.clusters[cidx]
81
+ # idx = cluster_inds[random.randint(0, len(cluster_inds) - 1)]
82
+ idx = random.choice(cluster_inds)
83
+ return super().__getitem__(idx)
84
+
85
+ class DPODataset(ProcessedLigandPocketDataset):
86
+ def __init__(self, pt_path, ligand_transform=None, pocket_transform=None,
87
+ catch_errors=False):
88
+ self.ligand_transform = ligand_transform
89
+ self.pocket_transform = pocket_transform
90
+ self.catch_errors = catch_errors
91
+ self.pt_path = pt_path
92
+
93
+ self.data = torch.load(pt_path)
94
+
95
+ if 'pockets' not in self.data:
96
+ self.data['pockets'] = self.data['pockets_w']
97
+ if 'ligands' not in self.data:
98
+ self.data['ligands'] = self.data['ligands_w']
99
+
100
+ if not (
101
+ len(self.data["ligands"]["name"])
102
+ == len(self.data["ligands_l"]["name"])
103
+ == len(self.data["pockets"]["name"])
104
+ ):
105
+ raise ValueError(
106
+ "Error while importing DPO Dataset: Number of ligands winning, ligands losing and pockets must be the same"
107
+ )
108
+
109
+ # add number of nodes for convenience
110
+ for entity in ['ligands', 'ligands_l', 'pockets']:
111
+ self.data[entity]['size'] = torch.tensor([len(x) for x in self.data[entity]['x']])
112
+ self.data[entity]['n_bonds'] = torch.tensor([len(x) for x in self.data[entity]['bond_one_hot']])
113
+
114
+ def __len__(self):
115
+ return len(self.data["ligands"]["name"])
116
+
117
+ def __getitem__(self, idx):
118
+ data = {}
119
+ data['ligand'] = {key: val[idx] for key, val in self.data['ligands'].items()}
120
+ data['ligand_l'] = {key: val[idx] for key, val in self.data['ligands_l'].items()}
121
+ data['pocket'] = {key: val[idx] for key, val in self.data['pockets'].items()}
122
+ try:
123
+ if self.ligand_transform is not None:
124
+ data['ligand'] = self.ligand_transform(data['ligand'])
125
+ data['ligand_l'] = self.ligand_transform(data['ligand_l'])
126
+ if self.pocket_transform is not None:
127
+ data['pocket'] = self.pocket_transform(data['pocket'])
128
+ except (RuntimeError, ValueError) as e:
129
+ if self.catch_errors:
130
+ warnings.warn(f"{type(e).__name__}('{e}') in data transform. "
131
+ f"Returning random item instead")
132
+ # replace bad item with a random one
133
+ rand_idx = random.randint(0, len(self) - 1)
134
+ return self[rand_idx]
135
+ else:
136
+ raise e
137
+ return data
138
+
139
+ @staticmethod
140
+ def collate_fn(batch_pairs, ligand_transform=None):
141
+
142
+ out = {}
143
+ for entity in ['ligand', 'ligand_l', 'pocket']:
144
+ batch = [x[entity] for x in batch_pairs]
145
+
146
+ if entity in ['ligand', 'ligand_l'] and ligand_transform is not None:
147
+ max_size = max(x['size'].item() for x in batch)
148
+ batch = [ligand_transform(x, max_size=max_size) for x in batch]
149
+
150
+ out[entity] = TensorDict(**collate_entity(batch))
151
+
152
+ return out
153
+
154
+ ##########################################
155
+ ############### WebDatasets ##############
156
+ ##########################################
157
+
158
+ class ProteinLigandWebDataset(wds.WebDataset):
159
+ @staticmethod
160
+ def collate_fn(batch_pairs, ligand_transform=None):
161
+ return ProcessedLigandPocketDataset.collate_fn(batch_pairs, ligand_transform)
162
+
163
+
164
+ def wds_decoder(key, value):
165
+ return torch.load(io.BytesIO(value))
166
+
167
+
168
+ def preprocess_wds_item(data):
169
+ out = {}
170
+ for entity in ['ligand', 'pocket']:
171
+ out[entity] = data['pt'][entity]
172
+ for attr in ['size', 'n_bonds']:
173
+ if torch.is_tensor(out[entity][attr]):
174
+ assert len(out[entity][attr]) == 0
175
+ out[entity][attr] = 0
176
+
177
+ return out
178
+
179
+
180
+ def get_wds(data_path, stage, ligand_transform=None, pocket_transform=None):
181
+ current_data_dir = Path(data_path, stage)
182
+ shards = sorted(current_data_dir.glob('shard-?????.tar'), key=lambda s: int(s.name.split('-')[-1].split('.')[0]))
183
+ min_shard = min(shards).name.split('-')[-1].split('.')[0]
184
+ max_shard = max(shards).name.split('-')[-1].split('.')[0]
185
+ total_size = (int(max_shard) - int(min_shard) + 1) * WEBDATASET_SHARD_SIZE if stage == 'train' else WEBDATASET_VAL_SIZE
186
+
187
+ url = f'{data_path}/{stage}/shard-{{{min_shard}..{max_shard}}}.tar'
188
+ ligand_transform_wrapper = lambda _data: _data
189
+ pocket_transform_wrapper = lambda _data: _data
190
+
191
+ if ligand_transform is not None:
192
+ def ligand_transform_wrapper(_data):
193
+ _data['pt']['ligand'] = ligand_transform(_data['pt']['ligand'])
194
+ return _data
195
+
196
+ if pocket_transform is not None:
197
+ def pocket_transform_wrapper(_data):
198
+ _data['pt']['pocket'] = pocket_transform(_data['pt']['pocket'])
199
+ return _data
200
+
201
+ return (
202
+ ProteinLigandWebDataset(url, nodesplitter=wds.split_by_node)
203
+ .decode(wds_decoder)
204
+ .map(ligand_transform_wrapper)
205
+ .map(pocket_transform_wrapper)
206
+ .map(preprocess_wds_item)
207
+ .with_length(total_size)
208
+ )
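A usage sketch for the WebDataset pipeline above, assuming shards live under <data_path>/train/shard-00000.tar etc.; batching happens inside the pipeline, so the DataLoader itself uses batch_size=None:

from torch.utils.data import DataLoader

train_ds = get_wds('webdataset_dir', 'train')
train_ds = train_ds.batched(32, collation_fn=ProteinLigandWebDataset.collate_fn)
loader = DataLoader(train_ds, batch_size=None, num_workers=4)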
src/data/misc.py ADDED
@@ -0,0 +1,19 @@
1
+ # From: https://github.com/biopython/biopython/blob/master/Bio/PDB/Polypeptide.py#L128
2
+
3
+ protein_letters_1to3 = {'A': 'ALA', 'C': 'CYS', 'D': 'ASP', 'E': 'GLU', 'F': 'PHE', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE', 'K': 'LYS', 'L': 'LEU', 'M': 'MET', 'N': 'ASN', 'P': 'PRO', 'Q': 'GLN', 'R': 'ARG', 'S': 'SER', 'T': 'THR', 'V': 'VAL', 'W': 'TRP', 'Y': 'TYR'}
4
+
5
+
6
+ protein_letters_3to1 = {'ALA': 'A', 'CYS': 'C', 'ASP': 'D', 'GLU': 'E', 'PHE': 'F', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I', 'LYS': 'K', 'LEU': 'L', 'MET': 'M', 'ASN': 'N', 'PRO': 'P', 'GLN': 'Q', 'ARG': 'R', 'SER': 'S', 'THR': 'T', 'VAL': 'V', 'TRP': 'W', 'TYR': 'Y'}
7
+
8
+
9
+ protein_letters_3to1_extended = {'A5N': 'N', 'A8E': 'V', 'A9D': 'S', 'AA3': 'A', 'AA4': 'A', 'AAR': 'R', 'ABA': 'A', 'ACL': 'R', 'AEA': 'C', 'AEI': 'D', 'AFA': 'N', 'AGM': 'R', 'AGQ': 'Y', 'AGT': 'C', 'AHB': 'N', 'AHL': 'R', 'AHO': 'A', 'AHP': 'A', 'AIB': 'A', 'AKL': 'D', 'AKZ': 'D', 'ALA': 'A', 'ALC': 'A', 'ALM': 'A', 'ALN': 'A', 'ALO': 'T', 'ALS': 'A', 'ALT': 'A', 'ALV': 'A', 'ALY': 'K', 'AME': 'M', 'AN6': 'L', 'AN8': 'A', 'API': 'K', 'APK': 'K', 'AR2': 'R', 'AR4': 'E', 'AR7': 'R', 'ARG': 'R', 'ARM': 'R', 'ARO': 'R', 'AS7': 'N', 'ASA': 'D', 'ASB': 'D', 'ASI': 'D', 'ASK': 'D', 'ASL': 'D', 'ASN': 'N', 'ASP': 'D', 'ASQ': 'D', 'AYA': 'A', 'AZH': 'A', 'AZK': 'K', 'AZS': 'S', 'AZY': 'Y', 'AVJ': 'H', 'A30': 'Y', 'A3U': 'F', 'ECC': 'Q', 'ECX': 'C', 'EFC': 'C', 'EHP': 'F', 'ELY': 'K', 'EME': 'E', 'EPM': 'M', 'EPQ': 'Q', 'ESB': 'Y', 'ESC': 'M', 'EXY': 'L', 'EXA': 'K', 'E0Y': 'P', 'E9V': 'H', 'E9M': 'W', 'EJA': 'C', 'EUP': 'T', 'EZY': 'G', 'E9C': 'Y', 'EW6': 'S', 'EXL': 'W', 'I2M': 'I', 'I4G': 'G', 'I58': 'K', 'IAM': 'A', 'IAR': 'R', 'ICY': 'C', 'IEL': 'K', 'IGL': 'G', 'IIL': 'I', 'ILE': 'I', 'ILG': 'E', 'ILM': 'I', 'ILX': 'I', 'ILY': 'K', 'IML': 'I', 'IOR': 'R', 'IPG': 'G', 'IT1': 'K', 'IYR': 'Y', 'IZO': 'M', 'IC0': 'G', 'M0H': 'C', 'M2L': 'K', 'M2S': 'M', 'M30': 'G', 'M3L': 'K', 'M3R': 'K', 'MA ': 'A', 'MAA': 'A', 'MAI': 'R', 'MBQ': 'Y', 'MC1': 'S', 'MCL': 'K', 'MCS': 'C', 'MD3': 'C', 'MD5': 'C', 'MD6': 'G', 'MDF': 'Y', 'ME0': 'M', 'MEA': 'F', 'MEG': 'E', 'MEN': 'N', 'MEQ': 'Q', 'MET': 'M', 'MEU': 'G', 'MFN': 'E', 'MGG': 'R', 'MGN': 'Q', 'MGY': 'G', 'MH1': 'H', 'MH6': 'S', 'MHL': 'L', 'MHO': 'M', 'MHS': 'H', 'MHU': 'F', 'MIR': 'S', 'MIS': 'S', 'MK8': 'L', 'ML3': 'K', 'MLE': 'L', 'MLL': 'L', 'MLY': 'K', 'MLZ': 'K', 'MME': 'M', 'MMO': 'R', 'MNL': 'L', 'MNV': 'V', 'MP8': 'P', 'MPQ': 'G', 'MSA': 'G', 'MSE': 'M', 'MSL': 'M', 'MSO': 'M', 'MT2': 'M', 'MTY': 'Y', 'MVA': 'V', 'MYK': 'K', 'MYN': 'R', 'QCS': 'C', 'QIL': 'I', 'QMM': 'Q', 'QPA': 'C', 'QPH': 'F', 'Q3P': 'K', 'QVA': 'C', 'QX7': 'A', 'Q2E': 'W', 'Q75': 'M', 'Q78': 'F', 'QM8': 'L', 'QMB': 'A', 'QNQ': 'C', 'QNT': 'C', 'QNW': 'C', 'QO2': 'C', 'QO5': 'C', 'QO8': 'C', 'QQ8': 'Q', 'U2X': 'Y', 'U3X': 'F', 'UF0': 'S', 'UGY': 'G', 'UM1': 'A', 'UM2': 'A', 'UMA': 'A', 'UQK': 'A', 'UX8': 'W', 'UXQ': 'F', 'YCM': 'C', 'YOF': 'Y', 'YPR': 'P', 'YPZ': 'Y', 'YTH': 'T', 'Y1V': 'L', 'Y57': 'K', 'YHA': 'K', '200': 'F', '23F': 'F', '23P': 'A', '26B': 'T', '28X': 'T', '2AG': 'A', '2CO': 'C', '2FM': 'M', '2GX': 'F', '2HF': 'H', '2JG': 'S', '2KK': 'K', '2KP': 'K', '2LT': 'Y', '2LU': 'L', '2ML': 'L', '2MR': 'R', '2MT': 'P', '2OR': 'R', '2P0': 'P', '2QZ': 'T', '2R3': 'Y', '2RA': 'A', '2RX': 'S', '2SO': 'H', '2TY': 'Y', '2VA': 'V', '2XA': 'C', '2ZC': 'S', '6CL': 'K', '6CW': 'W', '6GL': 'A', '6HN': 'K', '60F': 'C', '66D': 'I', '6CV': 'A', '6M6': 'C', '6V1': 'C', '6WK': 'C', '6Y9': 'P', '6DN': 'K', 'DA2': 'R', 'DAB': 'A', 'DAH': 'F', 'DBS': 'S', 'DBU': 'T', 'DBY': 'Y', 'DBZ': 'A', 'DC2': 'C', 'DDE': 'H', 'DDZ': 'A', 'DI7': 'Y', 'DHA': 'S', 'DHN': 'V', 'DIR': 'R', 'DLS': 'K', 'DM0': 'K', 'DMH': 'N', 'DMK': 'D', 'DNL': 'K', 'DNP': 'A', 'DNS': 'K', 'DNW': 'A', 'DOH': 'D', 'DON': 'L', 'DP1': 'R', 'DPL': 'P', 'DPP': 'A', 'DPQ': 'Y', 'DYS': 'C', 'D2T': 'D', 'DYA': 'D', 'DJD': 'F', 'DYJ': 'P', 'DV9': 'E', 'H14': 'F', 'H1D': 'M', 'H5M': 'P', 'HAC': 'A', 'HAR': 'R', 'HBN': 'H', 'HCM': 'C', 'HGY': 'G', 'HHI': 'H', 'HIA': 'H', 'HIC': 'H', 'HIP': 'H', 'HIQ': 'H', 'HIS': 'H', 'HL2': 'L', 'HLU': 'L', 'HMR': 'R', 'HNC': 'C', 'HOX': 'F', 'HPC': 'F', 'HPE': 'F', 'HPH': 'F', 'HPQ': 'F', 
'HQA': 'A', 'HR7': 'R', 'HRG': 'R', 'HRP': 'W', 'HS8': 'H', 'HS9': 'H', 'HSE': 'S', 'HSK': 'H', 'HSL': 'S', 'HSO': 'H', 'HT7': 'W', 'HTI': 'C', 'HTR': 'W', 'HV5': 'A', 'HVA': 'V', 'HY3': 'P', 'HYI': 'M', 'HYP': 'P', 'HZP': 'P', 'HIX': 'A', 'HSV': 'H', 'HLY': 'K', 'HOO': 'H', 'H7V': 'A', 'L5P': 'K', 'LRK': 'K', 'L3O': 'L', 'LA2': 'K', 'LAA': 'D', 'LAL': 'A', 'LBY': 'K', 'LCK': 'K', 'LCX': 'K', 'LDH': 'K', 'LE1': 'V', 'LED': 'L', 'LEF': 'L', 'LEH': 'L', 'LEM': 'L', 'LEN': 'L', 'LET': 'K', 'LEU': 'L', 'LEX': 'L', 'LGY': 'K', 'LLO': 'K', 'LLP': 'K', 'LLY': 'K', 'LLZ': 'K', 'LME': 'E', 'LMF': 'K', 'LMQ': 'Q', 'LNE': 'L', 'LNM': 'L', 'LP6': 'K', 'LPD': 'P', 'LPG': 'G', 'LPS': 'S', 'LSO': 'K', 'LTR': 'W', 'LVG': 'G', 'LVN': 'V', 'LWY': 'P', 'LYF': 'K', 'LYK': 'K', 'LYM': 'K', 'LYN': 'K', 'LYO': 'K', 'LYP': 'K', 'LYR': 'K', 'LYS': 'K', 'LYU': 'K', 'LYX': 'K', 'LYZ': 'K', 'LAY': 'L', 'LWI': 'F', 'LBZ': 'K', 'P1L': 'C', 'P2Q': 'Y', 'P2Y': 'P', 'P3Q': 'Y', 'PAQ': 'Y', 'PAS': 'D', 'PAT': 'W', 'PBB': 'C', 'PBF': 'F', 'PCA': 'Q', 'PCC': 'P', 'PCS': 'F', 'PE1': 'K', 'PEC': 'C', 'PF5': 'F', 'PFF': 'F', 'PG1': 'S', 'PGY': 'G', 'PHA': 'F', 'PHD': 'D', 'PHE': 'F', 'PHI': 'F', 'PHL': 'F', 'PHM': 'F', 'PKR': 'P', 'PLJ': 'P', 'PM3': 'F', 'POM': 'P', 'PPN': 'F', 'PR3': 'C', 'PR4': 'P', 'PR7': 'P', 'PR9': 'P', 'PRJ': 'P', 'PRK': 'K', 'PRO': 'P', 'PRS': 'P', 'PRV': 'G', 'PSA': 'F', 'PSH': 'H', 'PTH': 'Y', 'PTM': 'Y', 'PTR': 'Y', 'PVH': 'H', 'PXU': 'P', 'PYA': 'A', 'PYH': 'K', 'PYX': 'C', 'PH6': 'P', 'P9S': 'C', 'P5U': 'S', 'POK': 'R', 'T0I': 'Y', 'T11': 'F', 'TAV': 'D', 'TBG': 'V', 'TBM': 'T', 'TCQ': 'Y', 'TCR': 'W', 'TEF': 'F', 'TFQ': 'F', 'TH5': 'T', 'TH6': 'T', 'THC': 'T', 'THR': 'T', 'THZ': 'R', 'TIH': 'A', 'TIS': 'S', 'TLY': 'K', 'TMB': 'T', 'TMD': 'T', 'TNB': 'C', 'TNR': 'S', 'TNY': 'T', 'TOQ': 'W', 'TOX': 'W', 'TPJ': 'P', 'TPK': 'P', 'TPL': 'W', 'TPO': 'T', 'TPQ': 'Y', 'TQI': 'W', 'TQQ': 'W', 'TQZ': 'C', 'TRF': 'W', 'TRG': 'K', 'TRN': 'W', 'TRO': 'W', 'TRP': 'W', 'TRQ': 'W', 'TRW': 'W', 'TRX': 'W', 'TRY': 'W', 'TS9': 'I', 'TSY': 'C', 'TTQ': 'W', 'TTS': 'Y', 'TXY': 'Y', 'TY1': 'Y', 'TY2': 'Y', 'TY3': 'Y', 'TY5': 'Y', 'TY8': 'Y', 'TY9': 'Y', 'TYB': 'Y', 'TYC': 'Y', 'TYE': 'Y', 'TYI': 'Y', 'TYJ': 'Y', 'TYN': 'Y', 'TYO': 'Y', 'TYQ': 'Y', 'TYR': 'Y', 'TYS': 'Y', 'TYT': 'Y', 'TYW': 'Y', 'TYY': 'Y', 'T8L': 'T', 'T9E': 'T', 'TNQ': 'W', 'TSQ': 'F', 'TGH': 'W', 'X2W': 'E', 'XCN': 'C', 'XPR': 'P', 'XSN': 'N', 'XW1': 'A', 'XX1': 'K', 'XYC': 'A', 'XA6': 'F', '11Q': 'P', '11W': 'E', '12L': 'P', '12X': 'P', '12Y': 'P', '143': 'C', '1AC': 'A', '1L1': 'A', '1OP': 'Y', '1PA': 'F', '1PI': 'A', '1TQ': 'W', '1TY': 'Y', '1X6': 'S', '56A': 'H', '5AB': 'A', '5CS': 'C', '5CW': 'W', '5HP': 'E', '5OH': 'A', '5PG': 'G', '51T': 'Y', '54C': 'W', '5CR': 'F', '5CT': 'K', '5FQ': 'A', '5GM': 'I', '5JP': 'S', '5T3': 'K', '5MW': 'K', '5OW': 'K', '5R5': 'S', '5VV': 'N', '5XU': 'A', '55I': 'F', '999': 'D', '9DN': 'N', '9NE': 'E', '9NF': 'F', '9NR': 'R', '9NV': 'V', '9E7': 'K', '9KP': 'K', '9WV': 'A', '9TR': 'K', '9TU': 'K', '9TX': 'K', '9U0': 'K', '9IJ': 'F', 'B1F': 'F', 'B27': 'T', 'B2A': 'A', 'B2F': 'F', 'B2I': 'I', 'B2V': 'V', 'B3A': 'A', 'B3D': 'D', 'B3E': 'E', 'B3K': 'K', 'B3U': 'H', 'B3X': 'N', 'B3Y': 'Y', 'BB6': 'C', 'BB7': 'C', 'BB8': 'F', 'BB9': 'C', 'BBC': 'C', 'BCS': 'C', 'BCX': 'C', 'BFD': 'D', 'BG1': 'S', 'BH2': 'D', 'BHD': 'D', 'BIF': 'F', 'BIU': 'I', 'BL2': 'L', 'BLE': 'L', 'BLY': 'K', 'BMT': 'T', 'BNN': 'F', 'BOR': 'R', 'BP5': 'A', 'BPE': 'C', 'BSE': 'S', 'BTA': 'L', 'BTC': 'C', 'BTK': 'K', 'BTR': 'W', 'BUC': 'C', 'BUG': 'V', 
'BYR': 'Y', 'BWV': 'R', 'BWB': 'S', 'BXT': 'S', 'F2F': 'F', 'F2Y': 'Y', 'FAK': 'K', 'FB5': 'A', 'FB6': 'A', 'FC0': 'F', 'FCL': 'F', 'FDL': 'K', 'FFM': 'C', 'FGL': 'G', 'FGP': 'S', 'FH7': 'K', 'FHL': 'K', 'FHO': 'K', 'FIO': 'R', 'FLA': 'A', 'FLE': 'L', 'FLT': 'Y', 'FME': 'M', 'FOE': 'C', 'FP9': 'P', 'FPK': 'P', 'FT6': 'W', 'FTR': 'W', 'FTY': 'Y', 'FVA': 'V', 'FZN': 'K', 'FY3': 'Y', 'F7W': 'W', 'FY2': 'Y', 'FQA': 'K', 'F7Q': 'Y', 'FF9': 'K', 'FL6': 'D', 'JJJ': 'C', 'JJK': 'C', 'JJL': 'C', 'JLP': 'K', 'J3D': 'C', 'J9Y': 'R', 'J8W': 'S', 'JKH': 'P', 'N10': 'S', 'N7P': 'P', 'NA8': 'A', 'NAL': 'A', 'NAM': 'A', 'NBQ': 'Y', 'NC1': 'S', 'NCB': 'A', 'NEM': 'H', 'NEP': 'H', 'NFA': 'F', 'NIY': 'Y', 'NLB': 'L', 'NLE': 'L', 'NLN': 'L', 'NLO': 'L', 'NLP': 'L', 'NLQ': 'Q', 'NLY': 'G', 'NMC': 'G', 'NMM': 'R', 'NNH': 'R', 'NOT': 'L', 'NPH': 'C', 'NPI': 'A', 'NTR': 'Y', 'NTY': 'Y', 'NVA': 'V', 'NWD': 'A', 'NYB': 'C', 'NYS': 'C', 'NZH': 'H', 'N80': 'P', 'NZC': 'T', 'NLW': 'L', 'N0A': 'F', 'N9P': 'A', 'N65': 'K', 'R1A': 'C', 'R4K': 'W', 'RE0': 'W', 'RE3': 'W', 'RGL': 'R', 'RGP': 'E', 'RT0': 'P', 'RVX': 'S', 'RZ4': 'S', 'RPI': 'R', 'RVJ': 'A', 'VAD': 'V', 'VAF': 'V', 'VAH': 'V', 'VAI': 'V', 'VAL': 'V', 'VB1': 'K', 'VH0': 'P', 'VR0': 'R', 'V44': 'C', 'V61': 'F', 'VPV': 'K', 'V5N': 'H', 'V7T': 'K', 'Z01': 'A', 'Z3E': 'T', 'Z70': 'H', 'ZBZ': 'C', 'ZCL': 'F', 'ZU0': 'T', 'ZYJ': 'P', 'ZYK': 'P', 'ZZD': 'C', 'ZZJ': 'A', 'ZIQ': 'W', 'ZPO': 'P', 'ZDJ': 'Y', 'ZT1': 'K', '30V': 'C', '31Q': 'C', '33S': 'F', '33W': 'A', '34E': 'V', '3AH': 'H', '3BY': 'P', '3CF': 'F', '3CT': 'Y', '3GA': 'A', '3GL': 'E', '3MD': 'D', '3MY': 'Y', '3NF': 'Y', '3O3': 'E', '3PX': 'P', '3QN': 'K', '3TT': 'P', '3XH': 'G', '3YM': 'Y', '3WS': 'A', '3WX': 'P', '3X9': 'C', '3ZH': 'H', '7JA': 'I', '73C': 'S', '73N': 'R', '73O': 'Y', '73P': 'K', '74P': 'K', '7N8': 'F', '7O5': 'A', '7XC': 'F', '7ID': 'D', '7OZ': 'A', 'C1S': 'C', 'C1T': 'C', 'C1X': 'K', 'C22': 'A', 'C3Y': 'C', 'C4R': 'C', 'C5C': 'C', 'C6C': 'C', 'CAF': 'C', 'CAS': 'C', 'CAY': 'C', 'CCS': 'C', 'CEA': 'C', 'CGA': 'E', 'CGU': 'E', 'CGV': 'C', 'CHP': 'G', 'CIR': 'R', 'CLE': 'L', 'CLG': 'K', 'CLH': 'K', 'CME': 'C', 'CMH': 'C', 'CML': 'C', 'CMT': 'C', 'CR5': 'G', 'CS0': 'C', 'CS1': 'C', 'CS3': 'C', 'CS4': 'C', 'CSA': 'C', 'CSB': 'C', 'CSD': 'C', 'CSE': 'C', 'CSJ': 'C', 'CSO': 'C', 'CSP': 'C', 'CSR': 'C', 'CSS': 'C', 'CSU': 'C', 'CSW': 'C', 'CSX': 'C', 'CSZ': 'C', 'CTE': 'W', 'CTH': 'T', 'CWD': 'A', 'CWR': 'S', 'CXM': 'M', 'CY0': 'C', 'CY1': 'C', 'CY3': 'C', 'CY4': 'C', 'CYA': 'C', 'CYD': 'C', 'CYF': 'C', 'CYG': 'C', 'CYJ': 'K', 'CYM': 'C', 'CYQ': 'C', 'CYR': 'C', 'CYS': 'C', 'CYW': 'C', 'CZ2': 'C', 'CZZ': 'C', 'CG6': 'C', 'C1J': 'R', 'C4G': 'R', 'C67': 'R', 'C6D': 'R', 'CE7': 'N', 'CZS': 'A', 'G01': 'E', 'G8M': 'E', 'GAU': 'E', 'GEE': 'G', 'GFT': 'S', 'GHC': 'E', 'GHG': 'Q', 'GHW': 'E', 'GL3': 'G', 'GLH': 'Q', 'GLJ': 'E', 'GLK': 'E', 'GLN': 'Q', 'GLQ': 'E', 'GLU': 'E', 'GLY': 'G', 'GLZ': 'G', 'GMA': 'E', 'GME': 'E', 'GNC': 'Q', 'GPL': 'K', 'GSC': 'G', 'GSU': 'E', 'GT9': 'C', 'GVL': 'S', 'G3M': 'R', 'G5G': 'L', 'G1X': 'Y', 'G8X': 'P', 'K1R': 'C', 'KBE': 'K', 'KCX': 'K', 'KFP': 'K', 'KGC': 'K', 'KNB': 'A', 'KOR': 'M', 'KPI': 'K', 'KPY': 'K', 'KST': 'K', 'KYN': 'W', 'KYQ': 'K', 'KCR': 'K', 'KPF': 'K', 'K5L': 'S', 'KEO': 'K', 'KHB': 'K', 'KKD': 'D', 'K5H': 'C', 'K7K': 'S', 'OAR': 'R', 'OAS': 'S', 'OBS': 'K', 'OCS': 'C', 'OCY': 'C', 'OHI': 'H', 'OHS': 'D', 'OLD': 'H', 'OLT': 'T', 'OLZ': 'S', 'OMH': 'S', 'OMT': 'M', 'OMX': 'Y', 'OMY': 'Y', 'ONH': 'A', 'ORN': 'A', 'ORQ': 'R', 'OSE': 'S', 'OTH': 'T', 
'OXX': 'D', 'OYL': 'H', 'O7A': 'T', 'O7D': 'W', 'O7G': 'V', 'O2E': 'S', 'O6H': 'W', 'OZW': 'F', 'S12': 'S', 'S1H': 'S', 'S2C': 'C', 'S2P': 'A', 'SAC': 'S', 'SAH': 'C', 'SAR': 'G', 'SBG': 'S', 'SBL': 'S', 'SCH': 'C', 'SCS': 'C', 'SCY': 'C', 'SD4': 'N', 'SDB': 'S', 'SDP': 'S', 'SEB': 'S', 'SEE': 'S', 'SEG': 'A', 'SEL': 'S', 'SEM': 'S', 'SEN': 'S', 'SEP': 'S', 'SER': 'S', 'SET': 'S', 'SGB': 'S', 'SHC': 'C', 'SHP': 'G', 'SHR': 'K', 'SIB': 'C', 'SLL': 'K', 'SLZ': 'K', 'SMC': 'C', 'SME': 'M', 'SMF': 'F', 'SNC': 'C', 'SNN': 'N', 'SOY': 'S', 'SRZ': 'S', 'STY': 'Y', 'SUN': 'S', 'SVA': 'S', 'SVV': 'S', 'SVW': 'S', 'SVX': 'S', 'SVY': 'S', 'SVZ': 'S', 'SXE': 'S', 'SKH': 'K', 'SNM': 'S', 'SNK': 'H', 'SWW': 'S', 'WFP': 'F', 'WLU': 'L', 'WPA': 'F', 'WRP': 'W', 'WVL': 'V', '02K': 'A', '02L': 'N', '02O': 'A', '02Y': 'A', '033': 'V', '037': 'P', '03Y': 'C', '04U': 'P', '04V': 'P', '05N': 'P', '07O': 'C', '0A0': 'D', '0A1': 'Y', '0A2': 'K', '0A8': 'C', '0A9': 'F', '0AA': 'V', '0AB': 'V', '0AC': 'G', '0AF': 'W', '0AG': 'L', '0AH': 'S', '0AK': 'D', '0AR': 'R', '0BN': 'F', '0CS': 'A', '0E5': 'T', '0EA': 'Y', '0FL': 'A', '0LF': 'P', '0NC': 'A', '0PR': 'Y', '0QL': 'C', '0TD': 'D', '0UO': 'W', '0WZ': 'Y', '0X9': 'R', '0Y8': 'P', '4AF': 'F', '4AR': 'R', '4AW': 'W', '4BF': 'F', '4CF': 'F', '4CY': 'M', '4DP': 'W', '4FB': 'P', '4FW': 'W', '4HL': 'Y', '4HT': 'W', '4IN': 'W', '4MM': 'M', '4PH': 'F', '4U7': 'A', '41H': 'F', '41Q': 'N', '42Y': 'S', '432': 'S', '45F': 'P', '4AK': 'K', '4D4': 'R', '4GJ': 'C', '4KY': 'P', '4L0': 'P', '4LZ': 'Y', '4N7': 'P', '4N8': 'P', '4N9': 'P', '4OG': 'W', '4OU': 'F', '4OV': 'S', '4OZ': 'S', '4PQ': 'W', '4SJ': 'F', '4WQ': 'A', '4HH': 'S', '4HJ': 'S', '4J4': 'C', '4J5': 'R', '4II': 'F', '4VI': 'R', '823': 'N', '8SP': 'S', '8AY': 'A'}
10
+
11
+
12
+ def is_aa(residue, standard=False):
13
+ if not isinstance(residue, str):
14
+ residue = f"{residue.get_resname():<3s}"
15
+ residue = residue.upper()
16
+ if standard:
17
+ return residue in protein_letters_3to1
18
+ else:
19
+ return residue in protein_letters_3to1_extended
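For instance, with the tables above:

is_aa('ALA')                 # True
is_aa('MSE')                 # True  (selenomethionine, extended table only)
is_aa('MSE', standard=True)  # False
is_aa('HOH')                 # False (water is not an amino acid)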
src/data/molecule_builder.py ADDED
@@ -0,0 +1,107 @@
1
+ from rdkit import Chem
2
+
3
+ from src import constants
4
+
5
+
6
+ def remove_dummy_atoms(rdmol, sanitize=False):
7
+ # find exit atoms to be removed
8
+ dummy_inds = []
9
+ for a in rdmol.GetAtoms():
10
+ if a.GetSymbol() == '*':
11
+ dummy_inds.append(a.GetIdx())
12
+
13
+ dummy_inds = sorted(dummy_inds, reverse=True)
14
+ new_mol = Chem.EditableMol(rdmol)
15
+ for idx in dummy_inds:
16
+ new_mol.RemoveAtom(idx)
17
+ new_mol = new_mol.GetMol()
18
+ if sanitize:
19
+ Chem.SanitizeMol(new_mol)
20
+ return new_mol
21
+
22
+
23
+ def build_molecule(coords, atom_types, bonds=None, bond_types=None,
24
+ atom_props=None, atom_decoder=None, bond_decoder=None):
25
+ """
26
+ Build RDKit molecule with given bonds
27
+ :param coords: N x 3
28
+ :param atom_types: N
29
+ :param bonds: 2 x N_bonds
30
+ :param bond_types: N_bonds
31
+ :param atom_props: Dict, key: property name, value: list of float values (N,)
32
+ :param atom_decoder: list
33
+ :param bond_decoder: list
34
+ :return: RDKit molecule
35
+ """
36
+ if atom_decoder is None:
37
+ atom_decoder = constants.atom_decoder
38
+ if bond_decoder is None:
39
+ bond_decoder = constants.bond_decoder
40
+ assert len(coords) == len(atom_types)
41
+ assert bonds is None or bonds.size(1) == len(bond_types)
42
+
43
+ mol = Chem.RWMol()
44
+ for i, atom in enumerate(atom_types):
45
+ element = atom_decoder[atom.item()]
46
+ charge = None
47
+ explicitHs = None
48
+
49
+ if len(element) > 1 and element.endswith('H'):
50
+ explicitHs = 1
51
+ element = element[:-1]
52
+ elif element.endswith('+'):
53
+ charge = 1
54
+ element = element[:-1]
55
+ elif element.endswith('-'):
56
+ charge = -1
57
+ element = element[:-1]
58
+
59
+ if element == 'NOATOM':
60
+ # element = 'Xe' # debug
61
+ element = '*'
62
+
63
+ a = Chem.Atom(element)
64
+
65
+ if explicitHs is not None:
66
+ a.SetNumExplicitHs(explicitHs)
67
+ if charge is not None:
68
+ a.SetFormalCharge(charge)
69
+
70
+ if atom_props is not None:
71
+ for k, vals in atom_props.items():
72
+ a.SetDoubleProp(k, vals[i].item())
73
+
74
+ mol.AddAtom(a)
75
+
76
+ # add coordinates
77
+ conf = Chem.Conformer(mol.GetNumAtoms())
78
+ for i in range(mol.GetNumAtoms()):
79
+ conf.SetAtomPosition(i, (coords[i, 0].item(),
80
+ coords[i, 1].item(),
81
+ coords[i, 2].item()))
82
+ mol.AddConformer(conf)
83
+
84
+ # add bonds
85
+ if bonds is not None:
86
+ for bond, bond_type in zip(bonds.T, bond_types):
87
+ bond_type = bond_decoder[bond_type]
88
+ src = bond[0].item()
89
+ dst = bond[1].item()
90
+
91
+ # try:
92
+ if bond_type == 'NOBOND' or mol.GetAtomWithIdx(src).GetSymbol() == '*' or mol.GetAtomWithIdx(dst).GetSymbol() == '*':
93
+ continue
94
+ # except RuntimeError:
95
+ # from pdb import set_trace; set_trace()
96
+
97
+ if mol.GetBondBetweenAtoms(src, dst) is not None:
98
+ assert mol.GetBondBetweenAtoms(src, dst).GetBondType() == bond_type, \
99
+ "Trying to assign two different types to the same bond."
100
+ continue
101
+
102
+ if bond_type is None or src == dst:
103
+ continue
104
+ mol.AddBond(src, dst, bond_type)
105
+
106
+ mol = remove_dummy_atoms(mol, sanitize=False)
107
+ return mol
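A hedged sketch of building a tiny molecule with explicit decoders (the real defaults come from src.constants); the geometry is approximate:

import torch
from rdkit import Chem

atom_decoder = ['C', 'N', 'O']                    # hypothetical decoders
bond_decoder = [None, Chem.BondType.SINGLE, Chem.BondType.DOUBLE]
coords = torch.tensor([[0.0, 0.0, 0.0], [1.5, 0.0, 0.0], [2.2, 1.2, 0.0]])
atom_types = torch.tensor([0, 0, 2])              # C, C, O
bonds = torch.tensor([[0, 1], [1, 2]]).T          # 2 x N_bonds
bond_types = torch.tensor([1, 1])                 # both single bonds
mol = build_molecule(coords, atom_types, bonds, bond_types,
                     atom_decoder=atom_decoder, bond_decoder=bond_decoder)
Chem.SanitizeMol(mol)
print(Chem.MolToSmiles(mol))  # CCO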
src/data/nerf.py ADDED
@@ -0,0 +1,250 @@
1
+ """
2
+ Natural Extension Reference Frame (NERF)
3
+
4
+ Inspiration for parallel reconstruction:
5
+ https://github.com/EleutherAI/mp_nerf and references therein
6
+
7
+ For atom names, see also:
8
+ https://www.ccpn.ac.uk/manual/v3/NEFAtomNames.html
9
+
10
+ References:
11
+ - https://onlinelibrary.wiley.com/doi/10.1002/jcc.20237 (NERF)
12
+ - https://onlinelibrary.wiley.com/doi/10.1002/jcc.26768 (for code)
13
+ """
14
+
15
+ import warnings
16
+ import torch
17
+ import numpy as np
18
+
19
+ from src.data.misc import protein_letters_3to1
20
+ from src.constants import aa_atom_index, aa_atom_mask, aa_nerf_indices, aa_chi_indices, aa_chi_anchor_atom
21
+
22
+
23
+ # https://github.com/EleutherAI/mp_nerf/blob/master/mp_nerf/utils.py
24
+ def get_dihedral(c1, c2, c3, c4):
25
+ """ Returns the dihedral angle in radians.
26
+ Will use atan2 formula from:
27
+ https://en.wikipedia.org/wiki/Dihedral_angle#In_polymer_physics
28
+ Inputs:
29
+ * c1: (batch, 3) or (3,)
30
+ * c2: (batch, 3) or (3,)
31
+ * c3: (batch, 3) or (3,)
32
+ * c4: (batch, 3) or (3,)
33
+ """
34
+ u1 = c2 - c1
35
+ u2 = c3 - c2
36
+ u3 = c4 - c3
37
+
38
+ return torch.atan2( ( (torch.norm(u2, dim=-1, keepdim=True) * u1) * torch.cross(u2,u3, dim=-1) ).sum(dim=-1) ,
39
+ ( torch.cross(u1,u2, dim=-1) * torch.cross(u2, u3, dim=-1) ).sum(dim=-1) )
40
+
41
+
42
+ # https://github.com/EleutherAI/mp_nerf/blob/master/mp_nerf/utils.py
43
+ def get_angle(c1, c2, c3):
44
+ """ Returns the angle in radians.
45
+ Inputs:
46
+ * c1: (batch, 3) or (3,)
47
+ * c2: (batch, 3) or (3,)
48
+ * c3: (batch, 3) or (3,)
49
+ """
50
+ u1 = c2 - c1
51
+ u2 = c3 - c2
52
+
53
+ # dont use acos since norms involved.
54
+ # better use atan2 formula: atan2(cross, dot) from here:
55
+ # https://johnblackburne.blogspot.com/2012/05/angle-between-two-3d-vectors.html
56
+
57
+ # add a minus since we want the angle in reversed order - sidechainnet issues
58
+ return torch.atan2( torch.norm(torch.cross(u1,u2, dim=-1), dim=-1),
59
+ -(u1*u2).sum(dim=-1) )
60
+
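A quick numeric check of the two helpers, with four points whose torsion about the c2-c3 bond is a quarter turn:

import torch

c1 = torch.tensor([1.0, 0.0, 0.0])
c2 = torch.tensor([0.0, 0.0, 0.0])
c3 = torch.tensor([0.0, 1.0, 0.0])
c4 = torch.tensor([0.0, 1.0, 1.0])
get_dihedral(c1, c2, c3, c4)  # ≈ -pi/2 with this sign convention
get_angle(c2, c3, c4)         # ≈ pi/2 (the c2-c3-c4 bond angle)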
61
+
62
+ def get_nerf_params(biopython_residue):
63
+ aa = protein_letters_3to1[biopython_residue.get_resname()]
64
+
65
+ # Basic mask and index tensors
66
+ atom_mask = torch.tensor(aa_atom_mask[aa], dtype=bool)
67
+ nerf_indices = torch.tensor(aa_nerf_indices[aa], dtype=int)
68
+ chi_indices = torch.tensor(aa_chi_indices[aa], dtype=int)
69
+
70
+ fixed_coord = torch.zeros((5, 3))
71
+ residue_coords = torch.zeros((14, 3)) # only required to compute internal coordinates during pre-processing
72
+ atom_found = torch.zeros_like(atom_mask)
73
+ for atom in biopython_residue.get_atoms():
74
+ try:
75
+ idx = aa_atom_index[aa][atom.get_name()]
76
+ atom_found[idx] = True
77
+ except KeyError:
78
+ warnings.warn(f"{atom.get_name()} not found")
79
+ continue
80
+
81
+ residue_coords[idx, :] = torch.from_numpy(atom.get_coord())
82
+
83
+ if atom.get_name() in ['N', 'CA', 'C', 'O', 'CB']:
84
+ fixed_coord[idx, :] = torch.from_numpy(atom.get_coord())
85
+
86
+ # Determine chi angles
87
+ chi = torch.zeros(6) # the last chi angle is a dummy and should always be zero
88
+ for chi_idx, anchor in aa_chi_anchor_atom[aa].items():
89
+ idx_a = nerf_indices[anchor, 2]
90
+ idx_b = nerf_indices[anchor, 1]
91
+ idx_c = nerf_indices[anchor, 0]
92
+
93
+ coords_a = residue_coords[idx_a, :]
94
+ coords_b = residue_coords[idx_b, :]
95
+ coords_c = residue_coords[idx_c, :]
96
+ coords_d = residue_coords[anchor, :]
97
+
98
+ chi[chi_idx] = get_dihedral(coords_a, coords_b, coords_c, coords_d)
99
+
100
+ # Compute remaining internal coordinates
101
+ # (parallel version)
102
+ idx_a = nerf_indices[:, 2]
103
+ idx_b = nerf_indices[:, 1]
104
+ idx_c = nerf_indices[:, 0]
105
+
106
+ # update atom mask
107
+ # remove atoms for which one or several parameters are missing/incorrect
108
+ _atom_mask = atom_mask & atom_found & atom_found[idx_a] & atom_found[idx_b] & atom_found[idx_c]
109
+ if not torch.all(_atom_mask == atom_mask):
110
+ warnings.warn("Some atoms are missing for NERF reconstruction")
111
+ atom_mask = _atom_mask
112
+
113
+ coords_a = residue_coords[idx_a]
114
+ coords_b = residue_coords[idx_b]
115
+ coords_c = residue_coords[idx_c]
116
+ coords_d = residue_coords
117
+
118
+ length = torch.norm(coords_d - coords_c, dim=-1)
119
+ theta = get_angle(coords_b, coords_c, coords_d)
120
+ ddihedral = get_dihedral(coords_a, coords_b, coords_c, coords_d)
121
+
122
+ # subtract chi angles from dihedrals
123
+ ddihedral = ddihedral - chi[chi_indices]
124
+
125
+ # # (serial version)
126
+ # length = torch.zeros(14)
127
+ # theta = torch.zeros(14)
128
+ # ddihedral = torch.zeros(14)
129
+ # for i in range(5, 14):
130
+ # if not atom_mask[i]: # atom doesn't exist
131
+ # continue
132
+
133
+ # idx_a = nerf_indices[i, 2]
134
+ # idx_b = nerf_indices[i, 1]
135
+ # idx_c = nerf_indices[i, 0]
136
+
137
+ # coords_a = residue_coords[idx_a]
138
+ # coords_b = residue_coords[idx_b]
139
+ # coords_c = residue_coords[idx_c]
140
+ # coords_d = residue_coords[i]
141
+
142
+ # length[i] = torch.norm(coords_d - coords_c, dim=-1)
143
+ # theta[i] = get_angle(coords_b, coords_c, coords_d)
144
+ # ddihedral[i] = get_dihedral(coords_a, coords_b, coords_c, coords_d)
145
+
146
+ # # subtract chi angles from dihedrals
147
+ # ddihedral[i] = ddihedral[i] - chi[chi_indices[i]]
148
+
149
+ return {
150
+ 'fixed_coord': fixed_coord,
151
+ 'atom_mask': atom_mask,
152
+ 'nerf_indices': nerf_indices,
153
+ 'length': length,
154
+ 'theta': theta,
155
+ 'chi': chi,
156
+ 'ddihedral': ddihedral,
157
+ 'chi_indices': chi_indices,
158
+ }
159
+
160
+
161
+ # https://github.com/EleutherAI/mp_nerf/blob/master/mp_nerf/massive_pnerf.py#L38C1-L65C67
162
+ def mp_nerf_torch(a, b, c, l, theta, chi):
163
+ """ Custom Natural extension of Reference Frame.
164
+ Inputs:
165
+ * a: (batch, 3) or (3,). point(s) of the plane, not connected to d
166
+ * b: (batch, 3) or (3,). point(s) of the plane, not connected to d
167
+ * c: (batch, 3) or (3,). point(s) of the plane, connected to d
168
+ * theta: (batch,) or (float). angle(s) between b-c-d
169
+ * chi: (batch,) or float. dihedral angle(s) between the a-b-c and b-c-d planes
170
+ Outputs: d (batch, 3) or (float). the next point in the sequence, linked to c
171
+ """
172
+ # safety check
173
+ if not ( (-np.pi <= theta) * (theta <= np.pi) ).all().item():
174
+ raise ValueError(f"theta(s) must be in radians and in [-pi, pi]. theta(s) = {theta}")
175
+ # calc vecs
176
+ ba = b-a
177
+ cb = c-b
178
+ # calc rotation matrix. based on plane normals and normalized
179
+ n_plane = torch.cross(ba, cb, dim=-1)
180
+ n_plane_ = torch.cross(n_plane, cb, dim=-1)
181
+ rotate = torch.stack([cb, n_plane_, n_plane], dim=-1)
182
+ rotate /= torch.norm(rotate, dim=-2, keepdim=True)
183
+ # calc proto point, rotate. add (-1 for sidechainnet convention)
184
+ # https://github.com/jonathanking/sidechainnet/issues/14
185
+ d = torch.stack([-torch.cos(theta),
186
+ torch.sin(theta) * torch.cos(chi),
187
+ torch.sin(theta) * torch.sin(chi)], dim=-1).unsqueeze(-1)
188
+ # extend base point, set length
189
+ return c + l.unsqueeze(-1) * torch.matmul(rotate, d).squeeze()
190
+
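A hedged single-atom placement sketch: the returned point should sit exactly l away from c, at the requested angle and torsion (the values here are illustrative, roughly tetrahedral parameters):

import math
import torch

a = torch.tensor([0.0, 1.0, 1.0])
b = torch.tensor([0.0, 1.0, 0.0])
c = torch.tensor([0.0, 0.0, 0.0])
d = mp_nerf_torch(a, b, c,
                  l=torch.tensor(1.5),
                  theta=torch.tensor(math.radians(109.5)),
                  chi=torch.tensor(math.radians(60.0)))
assert torch.isclose(torch.norm(d - c), torch.tensor(1.5))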
191
+
192
+ # inspired by: https://github.com/EleutherAI/mp_nerf/blob/master/mp_nerf/proteins.py#L323C5-L344C65
193
+ def ic_to_coords(fixed_coord, atom_mask, nerf_indices, length, theta, chi, ddihedral, chi_indices):
194
+ """
195
+ Run NERF in parallel for all residues.
196
+
197
+ :param fixed_coord: (L, 5, 3) coordinates of (N, CA, C, O, CB) atoms, they don't depend on chi angles
198
+ :param atom_mask: (L, 14) indicates whether atom exists in this residue
199
+ :param nerf_indices: (L, 14, 3) indices of the three previous atoms ({c, b, a} for the NERF algorithm)
200
+ :param length: (L, 14) bond length between this and previous atom
201
+ :param theta: (L, 14) angle between this and previous two atoms
202
+ :param chi: (L, 6) values of the 5 rotatable bonds, plus zero in last column
203
+ :param ddihedral: (L, 14) angle offset to which chi is added
204
+ :param chi_indices: (L, 14) indexes into the chi array
205
+ :returns: (L, 14, 3) tensor with all coordinates, non-existing atoms are assigned CA coords
206
+ """
207
+
208
+ if not torch.all(chi[:, 5] == 0):
209
+ chi[:, 5] = 0.0
210
+ warnings.warn("Last column of 'chi' tensor should be zero. Overriding values.")
211
+ assert torch.all(chi[:, 5] == 0)
212
+
213
+ L, device = fixed_coord.size(0), fixed_coord.device
214
+ coords = torch.zeros((L, 14, 3), device=device)
215
+ coords[:, :5, :] = fixed_coord
216
+
217
+ for i in range(5, 14):
218
+ level_mask = atom_mask[:, i]
219
+ # level_mask = torch.ones(len(atom_mask), dtype=bool)
220
+
221
+ length_i = length[level_mask, i]
222
+ theta_i = theta[level_mask, i]
223
+
224
+ # dihedral_i = dihedral[level_mask, i]
225
+ dihedral_i = chi[level_mask, chi_indices[level_mask, i]] + ddihedral[level_mask, i]
226
+
227
+ idx_a = nerf_indices[level_mask, i, 2]
228
+ idx_b = nerf_indices[level_mask, i, 1]
229
+ idx_c = nerf_indices[level_mask, i, 0]
230
+
231
+ coords[level_mask, i] = mp_nerf_torch(coords[level_mask, idx_a],
232
+ coords[level_mask, idx_b],
233
+ coords[level_mask, idx_c],
234
+ length_i,
235
+ theta_i,
236
+ dihedral_i)
237
+
238
+ if coords.isnan().any():
239
+ warnings.warn("Side chain reconstruction error. Removing affected atoms...")
240
+
241
+ # mask out affected atoms
242
+ m, n, _ = torch.where(coords.isnan())
243
+ atom_mask[m, n] = False
244
+ coords[m, n, :] = 0.0
245
+
246
+ # replace non-existing atom coords with CA coords (TODO: don't hard-code CA index)
247
+ coords = atom_mask.unsqueeze(-1) * coords + \
248
+ (~atom_mask.unsqueeze(2)) * coords[:, 1, :].unsqueeze(1)
249
+
250
+ return coords
src/data/normal_modes.py ADDED
@@ -0,0 +1,69 @@
1
+ import warnings
2
+ import numpy as np
3
+ import prody
4
+ prody.confProDy(verbosity='none')
5
+ from prody import parsePDB, ANM
6
+
7
+
8
+ def pdb_to_normal_modes(pdb_file, num_modes=5, nmax=5000):
9
+ """
10
+ Compute normal modes for a PDB file using an Anisotropic Network Model (ANM)
11
+ http://prody.csb.pitt.edu/tutorials/enm_analysis/anm.html (accessed 01/11/2023)
12
+ """
13
+ protein = parsePDB(pdb_file, model=1).select('calpha')
14
+
15
+ if len(protein) > nmax:
16
+ warnings.warn("Protein is too big. Returning zeros...")
17
+ eig_vecs = np.zeros((len(protein), 3, num_modes))
18
+
19
+ else:
20
+ # build Hessian
21
+ anm = ANM('ANM analysis')
22
+ anm.buildHessian(protein, cutoff=15.0, gamma=1.0)
23
+
24
+ # calculate normal modes
25
+ anm.calcModes(num_modes, zeros=False)
26
+
27
+ # only use slowest modes
28
+ eig_vecs = anm.getEigvecs() # shape: (num_atoms * 3, num_modes)
29
+ eig_vecs = eig_vecs.reshape(len(protein), 3, num_modes)
30
+ # eig_vals = anm.getEigvals() # shape: (num_modes,)
31
+
32
+ nm_dict = {}
33
+ for atom, nm_vec in zip(protein, eig_vecs):
34
+ chain = atom.getChid()
35
+ resi = atom.getResnum()
36
+ name = atom.getName()
37
+ nm_dict[(chain, resi, name)] = nm_vec.T
38
+
39
+ return nm_dict
40
+
41
+
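A hedged usage sketch; any PDB with C-alpha atoms works, and the returned dictionary is keyed by (chain, residue number, atom name):

nm = pdb_to_normal_modes('protein.pdb', num_modes=5)  # placeholder path
vec = nm[('A', 42, 'CA')]  # (num_modes, 3) eigenvector rows for this CA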
42
+ if __name__ == "__main__":
43
+ import argparse
44
+ from pathlib import Path
45
+ import torch
46
+ from tqdm import tqdm
47
+
48
+ parser = argparse.ArgumentParser()
49
+ parser.add_argument('basedir', type=Path)
50
+ parser.add_argument('--outfile', type=Path, default=None)
51
+ args = parser.parse_args()
52
+
53
+ # Read data split
54
+ split_path = Path(args.basedir, 'split_by_name.pt')
55
+ data_split = torch.load(split_path)
56
+
57
+ pockets = [x[0] for split in data_split.values() for x in split]
58
+
59
+ all_normal_modes = {}
60
+ for p in tqdm(pockets):
61
+ pdb_file = Path(args.basedir, 'crossdocked_pocket10', p)
62
+
63
+ try:
64
+ nm_dict = pdb_to_normal_modes(str(pdb_file))
65
+ all_normal_modes[p] = nm_dict
66
+ except AttributeError as e:
67
+ warnings.warn(str(e))
68
+
69
+ np.save(args.outfile, all_normal_modes)
src/data/postprocessing.py ADDED
@@ -0,0 +1,93 @@
1
+ import warnings
2
+
3
+ from rdkit import Chem
4
+ from rdkit.Chem.rdForceFieldHelpers import UFFOptimizeMolecule, UFFHasAllMoleculeParams
5
+
6
+ from src.data import sanifix
7
+
8
+
9
+ def uff_relax(mol, max_iter=200):
10
+ """
11
+ Uses RDKit's universal force field (UFF) implementation to optimize a
12
+ molecule.
13
+ """
14
+ if not UFFHasAllMoleculeParams(mol):
15
+ warnings.warn('UFF parameters not available for all atoms. '
16
+ 'Returning None.')
17
+ return None
18
+
19
+ try:
20
+ more_iterations_required = UFFOptimizeMolecule(mol, maxIters=max_iter)
21
+ if more_iterations_required:
22
+ warnings.warn(f'Maximum number of FF iterations reached. '
23
+ f'Returning molecule after {max_iter} relaxation steps.')
24
+
25
+ except RuntimeError:
26
+ return None
27
+
28
+ return mol
29
+
30
+
31
+ def add_hydrogens(rdmol):
32
+ return Chem.AddHs(rdmol, addCoords=(len(rdmol.GetConformers()) > 0))
33
+
34
+
35
+ def get_largest_fragment(rdmol):
36
+ mol_frags = Chem.GetMolFrags(rdmol, asMols=True, sanitizeFrags=False)
37
+ largest_frag = max(mol_frags, default=rdmol, key=lambda m: m.GetNumAtoms())
38
+
39
+ # try:
40
+ # Chem.SanitizeMol(largest_frag)
41
+ # except ValueError:
42
+ # return None
43
+
44
+ return largest_frag
45
+
46
+
47
+ def process_all(rdmol, largest_frag=True, adjust_aromatic_Ns=True, relax_iter=0):
48
+ """
49
+ Apply all filters and post-processing steps. Returns a new molecule.
50
+
51
+ Returns:
52
+ RDKit molecule or None if it does not pass the filters or processing
53
+ fails
54
+ """
55
+
56
+ # Only consider non-trivial molecules
57
+ if rdmol.GetNumAtoms() < 1:
58
+ return None
59
+
60
+ # Create a copy
61
+ mol = Chem.Mol(rdmol)
62
+
63
+ # try:
64
+ # Chem.SanitizeMol(mol)
65
+ # except ValueError:
66
+ # warnings.warn('Sanitization failed. Returning None.')
67
+ # return None
68
+
69
+ if largest_frag:
70
+ mol = get_largest_fragment(mol)
71
+ # if mol is None:
72
+ # return None
73
+
74
+ if adjust_aromatic_Ns:
75
+ mol = sanifix.fix_mol(mol)
76
+ if mol is None:
77
+ return None
78
+
79
+ # if add_hydrogens:
80
+ # mol = add_hydrogens(mol)
81
+
82
+ if relax_iter > 0:
83
+ mol = uff_relax(mol, relax_iter)
84
+ if mol is None:
85
+ return None
86
+
87
+ try:
88
+ Chem.SanitizeMol(mol)
89
+ except ValueError:
90
+ warnings.warn('Sanitization failed. Returning None.')
91
+ return None
92
+
93
+ return mol
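A hedged end-to-end sketch (relax_iter=0 skips the UFF step, so no 3D conformer is needed; sanifix.fix_mol is assumed to pass valid molecules through unchanged):

from rdkit import Chem

raw = Chem.MolFromSmiles('CCO.C')  # two fragments; the largest is kept
processed = process_all(raw, largest_frag=True, adjust_aromatic_Ns=True,
                        relax_iter=0)
if processed is not None:
    print(Chem.MolToSmiles(processed))  # CCO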
src/data/process_crossdocked.py ADDED
@@ -0,0 +1,176 @@
1
+ from pathlib import Path
2
+ from time import time
3
+ import argparse
4
+ import shutil
5
+ import random
6
+ import yaml
7
+ from collections import defaultdict
8
+
9
+ import torch
10
+ from tqdm import tqdm
11
+ import numpy as np
12
+ from Bio.PDB import PDBParser
13
+ from rdkit import Chem
14
+
15
+ import sys
16
+ basedir = Path(__file__).resolve().parent.parent.parent
17
+ sys.path.append(str(basedir))
18
+
19
+ from src.data.data_utils import process_raw_pair, get_n_nodes, get_type_histogram
20
+ from src.data.data_utils import rdmol_to_smiles
21
+ from src.constants import atom_encoder, bond_encoder
22
+
23
+
24
+ if __name__ == '__main__':
25
+ parser = argparse.ArgumentParser()
26
+ parser.add_argument('basedir', type=Path)
27
+ parser.add_argument('--outdir', type=Path, default=None)
28
+ parser.add_argument('--split_path', type=Path, default=None)
29
+ parser.add_argument('--pocket', type=str, default='CA+',
30
+ choices=['side_chain_bead', 'CA+'])
31
+ parser.add_argument('--random_seed', type=int, default=42)
32
+ parser.add_argument('--val_size', type=int, default=100)
33
+ parser.add_argument('--normal_modes', action='store_true')
34
+ parser.add_argument('--flex', action='store_true')
35
+ parser.add_argument('--toy', action='store_true')
36
+ args = parser.parse_args()
37
+
38
+ random.seed(args.random_seed)
39
+
40
+ datadir = args.basedir / 'crossdocked_pocket10/'
41
+
42
+ # Make output directory
43
+ dirname = f"processed_crossdocked_{args.pocket}"
44
+ if args.flex:
45
+ dirname += '_flex'
46
+ if args.normal_modes:
47
+ dirname += '_nma'
48
+ if args.toy:
49
+ dirname += '_toy'
50
+ processed_dir = Path(args.basedir, dirname) if args.outdir is None else args.outdir
51
+ processed_dir.mkdir(parents=True)
52
+
53
+ # Read data split
54
+ split_path = Path(args.basedir, 'split_by_name.pt') if args.split_path is None else args.split_path
55
+ data_split = torch.load(split_path)
56
+
57
+ # If there is no validation set, copy training examples (the validation set
58
+ # is not very important in this application)
59
+ if 'val' not in data_split:
60
+ random.shuffle(data_split['train'])
61
+ data_split['val'] = data_split['train'][-args.val_size:]
62
+ data_split['train'] = data_split['train'][:-args.val_size]
63
+
64
+ if args.toy:
65
+ data_split['train'] = random.sample(data_split['train'], 100)
66
+
67
+ failed = {}
68
+ train_smiles = []
69
+
70
+ n_samples_after = {}
71
+ for split in data_split.keys():
72
+
73
+ print(f"Processing {split} dataset...")
74
+
75
+ ligands = defaultdict(list)
76
+ pockets = defaultdict(list)
77
+
78
+ tic = time()
79
+ pbar = tqdm(data_split[split])
80
+ for pocket_fn, ligand_fn in pbar:
81
+
82
+ pbar.set_description(f'#failed: {len(failed)}')
83
+
84
+ sdffile = datadir / f'{ligand_fn}'
85
+ pdbfile = datadir / f'{pocket_fn}'
86
+
87
+ try:
88
+ pdb_model = PDBParser(QUIET=True).get_structure('', pdbfile)[0]
89
+
90
+ rdmol = Chem.SDMolSupplier(str(sdffile))[0]
91
+
92
+ ligand, pocket = process_raw_pair(
93
+ pdb_model, rdmol, pocket_representation=args.pocket,
94
+ compute_nerf_params=args.flex, compute_bb_frames=args.flex,
95
+ nma_input=pdbfile if args.normal_modes else None)
96
+
97
+ except (KeyError, AssertionError, FileNotFoundError, IndexError,
98
+ ValueError, AttributeError) as e:
99
+ failed[(split, sdffile, pdbfile)] = (type(e).__name__, str(e))
100
+ continue
101
+
102
+ nerf_keys = ['fixed_coord', 'atom_mask', 'nerf_indices', 'length', 'theta', 'chi', 'ddihedral', 'chi_indices']
103
+ for k in ['x', 'one_hot', 'bonds', 'bond_one_hot', 'v', 'nma_vec'] + nerf_keys + ['axis_angle']:
104
+ if k in ligand:
105
+ ligands[k].append(ligand[k])
106
+ if k in pocket:
107
+ pockets[k].append(pocket[k])
108
+
109
+ pocket_file = pdbfile.name.replace('_', '-')
110
+ ligand_file = Path(pocket_file).stem + '_' + Path(sdffile).name.replace('_', '-')
111
+ ligands['name'].append(ligand_file)
112
+ pockets['name'].append(pocket_file)
113
+ train_smiles.append(rdmol_to_smiles(rdmol))
114
+
115
+ if split in {'val', 'test'}:
116
+ pdb_sdf_dir = processed_dir / split
117
+ pdb_sdf_dir.mkdir(exist_ok=True)
118
+
119
+ # Copy PDB file
120
+ pdb_file_out = Path(pdb_sdf_dir, pocket_file)
121
+ shutil.copy(pdbfile, pdb_file_out)
122
+
123
+ # Copy SDF file
124
+ sdf_file_out = Path(pdb_sdf_dir, ligand_file)
125
+ shutil.copy(sdffile, sdf_file_out)
126
+
127
+ data = {'ligands': ligands, 'pockets': pockets}
128
+ torch.save(data, Path(processed_dir, f'{split}.pt'))
129
+
130
+ if split == 'train':
131
+ np.save(Path(processed_dir, 'train_smiles.npy'), train_smiles)
132
+
133
+ print(f"Processing {split} set took {(time() - tic) / 60.0:.2f} minutes")
134
+
135
+
136
+ # --------------------------------------------------------------------------
137
+ # Compute statistics & additional information
138
+ # --------------------------------------------------------------------------
139
+ train_data = torch.load(Path(processed_dir, 'train.pt'))
140
+
141
+ # Maximum molecule size
142
+ max_ligand_size = max([len(x) for x in train_data['ligands']['x']])
143
+
144
+ # Joint histogram of number of ligand and pocket nodes
145
+ pocket_coords = train_data['pockets']['x']
146
+ ligand_coords = train_data['ligands']['x']
147
+ n_nodes = get_n_nodes(ligand_coords, pocket_coords)
148
+ np.save(Path(processed_dir, 'size_distribution.npy'), n_nodes)
149
+
150
+ # Get histograms of ligand node types
151
+ lig_one_hot = [x.numpy() for x in train_data['ligands']['one_hot']]
152
+ ligand_hist = get_type_histogram(lig_one_hot, atom_encoder)
153
+ np.save(Path(processed_dir, 'ligand_type_histogram.npy'), ligand_hist)
154
+
155
+ # Get histograms of ligand edge types
156
+ lig_bond_one_hot = [x.numpy() for x in train_data['ligands']['bond_one_hot']]
157
+ ligand_bond_hist = get_type_histogram(lig_bond_one_hot, bond_encoder)
158
+ np.save(Path(processed_dir, 'ligand_bond_type_histogram.npy'), ligand_bond_hist)
159
+
160
+ # Write error report
161
+ error_str = ""
162
+ for k, v in failed.items():
163
+ error_str += f"{'Split':<15}: {k[0]}\n"
164
+ error_str += f"{'Ligand':<15}: {k[1]}\n"
165
+ error_str += f"{'Pocket':<15}: {k[2]}\n"
166
+ error_str += f"{'Error type':<15}: {v[0]}\n"
167
+ error_str += f"{'Error msg':<15}: {v[1]}\n\n"
168
+
169
+ with open(Path(processed_dir, 'errors.txt'), 'w') as f:
170
+ f.write(error_str)
171
+
172
+ metadata = {
173
+ 'max_ligand_size': max_ligand_size
174
+ }
175
+ with open(Path(processed_dir, 'metadata.yml'), 'w') as f:
176
+ yaml.dump(metadata, f, default_flow_style=False)
src/data/process_dpo_dataset.py ADDED
@@ -0,0 +1,406 @@
+ import argparse
+ from pathlib import Path
+ import numpy as np
+ import random
+ import shutil
+ from time import time
+ from collections import defaultdict
+ from Bio.PDB import PDBParser
+ from rdkit import Chem
+ import torch
+ from tqdm import tqdm
+ import pandas as pd
+ from itertools import combinations
+
+ import sys
+ basedir = Path(__file__).resolve().parent.parent.parent
+ sys.path.append(str(basedir))
+
+ from src.sbdd_metrics.metrics import REOSEvaluator, MedChemEvaluator, PoseBustersEvaluator, GninaEvalulator
+ from src.data.data_utils import process_raw_pair, rdmol_to_smiles
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--smplsdir', type=Path, required=True)
+     parser.add_argument('--metrics-detailed', type=Path, required=False)
+     parser.add_argument('--ignore-missing-scores', action='store_true')
+     parser.add_argument('--datadir', type=Path, required=True)
+     parser.add_argument('--dpo-criterion', type=str, default='reos.all',
+                         choices=['reos.all', 'medchem.sa', 'medchem.qed', 'gnina.vina_efficiency', 'combined'])
+     parser.add_argument('--basedir', type=Path, default=None)
+     parser.add_argument('--pocket', type=str, default='CA+',
+                         choices=['side_chain_bead', 'CA+'])
+     parser.add_argument('--gnina', type=Path, default='gnina')
+     parser.add_argument('--random_seed', type=int, default=42)
+     parser.add_argument('--normal_modes', action='store_true')
+     parser.add_argument('--flex', action='store_true')
+     parser.add_argument('--toy', action='store_true')
+     parser.add_argument('--toy_size', type=int, default=100)
+     parser.add_argument('--n_pairs', type=int, default=5)
+     args = parser.parse_args()
+     return args
+
+
+ def scan_smpl_dir(samples_dir):
+     samples_dir = Path(samples_dir)
+     subdirs = []
+     for subdir in tqdm(samples_dir.iterdir(), desc='Scanning samples'):
+         if not subdir.is_dir():
+             continue
+         if not sample_dir_valid(subdir):
+             continue
+         subdirs.append(subdir)
+     return subdirs
+
+
+ def sample_dir_valid(sample_dir):
+     pocket = sample_dir / '0_pocket.pdb'
+     if not pocket.exists():
+         return False
+     ligands = list(sample_dir.glob('*_ligand.sdf'))
+     if len(ligands) < 2:
+         return False
+     for ligand in ligands:
+         if ligand.stat().st_size == 0:
+             return False
+     return True
+
+
+ def return_winning_losing_smpl(score_1, score_2, criterion):
+     """Return True if sample 1 wins, False if sample 2 wins, and None if the
+     difference is below the criterion-specific significance threshold."""
+     if criterion == 'reos.all':
+         if score_1 == score_2:
+             return None
+         return score_1 > score_2
+     elif criterion == 'medchem.sa':
+         if np.abs(score_1 - score_2) < 0.5:
+             return None
+         return score_1 < score_2  # lower SA score is better
+     elif criterion == 'medchem.qed':
+         if np.abs(score_1 - score_2) < 0.1:
+             return None
+         return score_1 > score_2
+     elif criterion == 'gnina.vina_efficiency':
+         if np.abs(score_1 - score_2) < 0.1:
+             return None
+         return score_1 < score_2  # lower (more negative) Vina score is better
+     elif criterion == 'combined':
+         score_reos_1, score_reos_2 = score_1['reos.all'], score_2['reos.all']
+         score_sa_1, score_sa_2 = score_1['medchem.sa'], score_2['medchem.sa']
+         score_qed_1, score_qed_2 = score_1['medchem.qed'], score_2['medchem.qed']
+         score_vina_1, score_vina_2 = score_1['gnina.vina_efficiency'], score_2['gnina.vina_efficiency']
+         if score_reos_1 == score_reos_2:
+             return None
+         # only keep the pair if all individual criteria agree on the winner
+         reos_sign = score_reos_1 > score_reos_2
+         sa_sign = score_sa_1 < score_sa_2
+         qed_sign = score_qed_1 > score_qed_2
+         vina_sign = score_vina_1 < score_vina_2
+         signs = [reos_sign, sa_sign, qed_sign, vina_sign]
+         if all(signs) or not any(signs):
+             return signs[0]
+         return None
+
+
+ def compute_scores(sample_dirs, evaluator, criterion, n_pairs=5, toy=False, toy_size=100,
+                    precomp_scores=None, ignore_missing_scores=False):
+     samples = []
+     pose_evaluator = PoseBustersEvaluator()
+     pbar = tqdm(sample_dirs, desc='Computing scores for samples')
+
+     for sample_dir in pbar:
+         pocket = sample_dir / '0_pocket.pdb'
+         ligands = list(sample_dir.glob('*_ligand.sdf'))
+
+         target_samples = []
+         for lig_path in ligands:
+             try:
+                 mol = Chem.SDMolSupplier(str(lig_path))[0]
+                 if mol is None:
+                     continue
+                 smiles = rdmol_to_smiles(mol)
+             except Exception as e:
+                 print(f'Failed to read ligand {lig_path}: {e}')
+                 continue
+
+             if precomp_scores is not None and str(lig_path) in precomp_scores.index:
+                 mol_props = precomp_scores.loc[str(lig_path)].to_dict()
+                 if criterion == 'combined':
+                     combined_keys = ('reos.all', 'medchem.sa', 'medchem.qed', 'gnina.vina_efficiency')
+                     if any(k not in mol_props for k in combined_keys):
+                         print('Missing combined scores for ligand:', lig_path)
+                         continue
+                     mol_props['combined'] = {
+                         'reos.all': mol_props['reos.all'],
+                         'medchem.sa': mol_props['medchem.sa'],
+                         'medchem.qed': mol_props['medchem.qed'],
+                         'gnina.vina_efficiency': mol_props['gnina.vina_efficiency'],
+                         # score used for ranking pairs under the combined criterion
+                         'combined': mol_props['gnina.vina_efficiency']
+                     }
+             else:
+                 mol_props = {}
+             if criterion not in mol_props:
+                 if ignore_missing_scores:
+                     print(f'Missing {criterion} for ligand:', lig_path)
+                     continue
+                 print(f'Recomputing {criterion} for ligand:', lig_path)
+                 try:
+                     eval_res = evaluator.evaluate(mol)
+                     criterion_cat = criterion.split('.')[0]
+                     eval_res = {f'{criterion_cat}.{k}': v for k, v in eval_res.items()}
+                     score = eval_res[criterion]
+                 except Exception:
+                     continue
+             else:
+                 score = mol_props[criterion]
+
+             if 'posebusters.all' not in mol_props:
+                 if ignore_missing_scores:
+                     print('Missing PoseBusters for ligand:', lig_path)
+                     continue
+                 print('Recomputing PoseBusters for ligand:', lig_path)
+                 try:
+                     pose_eval_res = pose_evaluator.evaluate(lig_path, pocket)
+                 except Exception:
+                     continue
+                 if 'all' not in pose_eval_res or not pose_eval_res['all']:
+                     continue
+             else:
+                 pose_eval_res = mol_props['posebusters.all']
+                 if not pose_eval_res:
+                     continue
+
+             target_samples.append({
+                 'smiles': smiles,
+                 'score': score,
+                 'ligand_path': lig_path,
+                 'pocket_path': pocket
+             })
+
+         # Deduplicate by SMILES
+         unique_samples = {}
+         for sample in target_samples:
+             if sample['smiles'] not in unique_samples:
+                 unique_samples[sample['smiles']] = sample
+         unique_samples = list(unique_samples.values())
+         if len(unique_samples) < 2:
+             continue
+
+         # Generate all possible pairs
+         all_pairs = list(combinations(unique_samples, 2))
+
+         # Calculate score differences and keep only pairs with a clear winner
+         valid_pairs = []
+         for s1, s2 in all_pairs:
+             sign = return_winning_losing_smpl(s1['score'], s2['score'], criterion)
+             if sign is None:
+                 continue
+             score_diff = abs(s1['score'] - s2['score']) if criterion != 'combined' else \
+                 abs(s1['score']['combined'] - s2['score']['combined'])
+             if sign:
+                 valid_pairs.append((s1, s2, score_diff))
+             else:
+                 valid_pairs.append((s2, s1, score_diff))
+
+         # Sort pairs by score difference (descending) and select the top n_pairs,
+         # using every ligand at most once
+         valid_pairs.sort(key=lambda x: x[2], reverse=True)
+         used_ligand_paths = set()
+         selected_pairs = []
+         for winning, losing, score_diff in valid_pairs:
+             if winning['ligand_path'] in used_ligand_paths or losing['ligand_path'] in used_ligand_paths:
+                 continue
+
+             selected_pairs.append((winning, losing, score_diff))
+             used_ligand_paths.add(winning['ligand_path'])
+             used_ligand_paths.add(losing['ligand_path'])
+
+             if len(selected_pairs) == n_pairs:
+                 break
+
+         for winning, losing, _ in selected_pairs:
+             d = {
+                 'score_w': winning['score'],
+                 'score_l': losing['score'],
+                 'pocket_p': winning['pocket_path'],
+                 'ligand_p_w': winning['ligand_path'],
+                 'ligand_p_l': losing['ligand_path']
+             }
+             if isinstance(winning['score'], dict):
+                 for k, v in winning['score'].items():
+                     d[f'{k}_w'] = v
+                 d['score_w'] = winning['score']['combined']
+             if isinstance(losing['score'], dict):
+                 for k, v in losing['score'].items():
+                     d[f'{k}_l'] = v
+                 d['score_l'] = losing['score']['combined']
+             samples.append(d)
+
+         pbar.set_postfix({'samples': len(samples)})
+
+         if toy and len(samples) >= toy_size:
+             break
+
+     return samples
+
+
+ def main():
+     args = parse_args()
+
+     if 'reos' in args.dpo_criterion:
+         evaluator = REOSEvaluator()
+     elif 'medchem' in args.dpo_criterion:
+         evaluator = MedChemEvaluator()
+     elif 'gnina' in args.dpo_criterion:
+         evaluator = GninaEvalulator(gnina=args.gnina)
+     elif 'combined' in args.dpo_criterion:
+         evaluator = None  # for the combined criterion, metrics have to be computed separately
+         if args.metrics_detailed is None:
+             raise ValueError('For the combined criterion, a detailed metrics file has to be provided')
+         if not args.ignore_missing_scores:
+             raise ValueError('For the combined criterion, the --ignore-missing-scores flag has to be set')
+     else:
+         raise ValueError(f"Unknown DPO criterion: {args.dpo_criterion}")
+
+     # Make output directory
+     dirname = f"dpo_{args.dpo_criterion.replace('.', '_')}_{args.pocket}"
+     if args.flex:
+         dirname += '_flex'
+     if args.normal_modes:
+         dirname += '_nma'
+     if args.toy:
+         dirname += '_toy'
+     processed_dir = Path(args.basedir, dirname)
+     processed_dir.mkdir(parents=True, exist_ok=True)
+
+     if (processed_dir / f'samples_{args.dpo_criterion}.csv').exists():
+         print(f"Samples already computed for criterion {args.dpo_criterion}, loading from file")
+         samples = pd.read_csv(processed_dir / f'samples_{args.dpo_criterion}.csv')
+         samples = [dict(row) for _, row in samples.iterrows()]
+         print(f"Found {len(samples)} winning/losing samples")
+     else:
+         print('Scanning sample directory...')
+         samples_dir = Path(args.smplsdir)
+         sample_dirs = scan_smpl_dir(samples_dir)
+         if args.metrics_detailed:
+             print(f'Loading precomputed scores from {args.metrics_detailed}')
+             precomp_scores = pd.read_csv(args.metrics_detailed)
+             precomp_scores = precomp_scores.set_index('sdf_file')
+         else:
+             precomp_scores = None
+         print(f'Found {len(sample_dirs)} valid sample directories')
+         print('Computing scores...')
+         samples = compute_scores(sample_dirs, evaluator, args.dpo_criterion,
+                                  n_pairs=args.n_pairs, toy=args.toy, toy_size=args.toy_size,
+                                  precomp_scores=precomp_scores,
+                                  ignore_missing_scores=args.ignore_missing_scores)
+         print(f'Found {len(samples)} winning/losing samples, saving to file')
+         pd.DataFrame(samples).to_csv(Path(processed_dir, f'samples_{args.dpo_criterion}.csv'), index=False)
+
+     data_split = {}
+     data_split['train'] = samples
+     if args.toy:
+         data_split['train'] = random.sample(samples, min(args.toy_size, len(data_split['train'])))
+
+     failed = {}
+     train_smiles = []
+
+     for split in data_split.keys():
+
+         print(f"Processing {split} dataset...")
+
+         ligands_w = defaultdict(list)
+         ligands_l = defaultdict(list)
+         pockets = defaultdict(list)
+
+         tic = time()
+         pbar = tqdm(data_split[split])
+         for entry in pbar:
+
+             pbar.set_description(f'#failed: {len(failed)}')
+
+             pdbfile = Path(entry['pocket_p'])
+             entry['ligand_p_w'] = Path(entry['ligand_p_w'])
+             entry['ligand_p_l'] = Path(entry['ligand_p_l'])
+             entry['ligand_w'] = Chem.SDMolSupplier(str(entry['ligand_p_w']))[0]
+             entry['ligand_l'] = Chem.SDMolSupplier(str(entry['ligand_p_l']))[0]
+
+             try:
+                 pdb_model = PDBParser(QUIET=True).get_structure('', pdbfile)[0]
+
+                 ligand_w, pocket = process_raw_pair(
+                     pdb_model, entry['ligand_w'], pocket_representation=args.pocket,
+                     compute_nerf_params=args.flex, compute_bb_frames=args.flex,
+                     nma_input=pdbfile if args.normal_modes else None)
+                 ligand_l, _ = process_raw_pair(
+                     pdb_model, entry['ligand_l'], pocket_representation=args.pocket,
+                     compute_nerf_params=args.flex, compute_bb_frames=args.flex,
+                     nma_input=pdbfile if args.normal_modes else None)
+
+             except (KeyError, AssertionError, FileNotFoundError, IndexError,
+                     ValueError, AttributeError) as e:
+                 failed[(split, entry['ligand_p_w'], entry['ligand_p_l'], pdbfile)] \
+                     = (type(e).__name__, str(e))
+                 continue
+
+             nerf_keys = ['fixed_coord', 'atom_mask', 'nerf_indices', 'length', 'theta', 'chi', 'ddihedral', 'chi_indices']
+             for k in ['x', 'one_hot', 'bonds', 'bond_one_hot', 'v', 'nma_vec'] + nerf_keys + ['axis_angle']:
+                 if k in ligand_w:
+                     ligands_w[k].append(ligand_w[k])
+                     ligands_l[k].append(ligand_l[k])
+                 if k in pocket:
+                     pockets[k].append(pocket[k])
+
+             smpl_n = pdbfile.parent.name
+             pocket_file = f'{smpl_n}__{pdbfile.stem}.pdb'
+             ligand_file_w = f'{smpl_n}__{entry["ligand_p_w"].stem}.sdf'
+             ligand_file_l = f'{smpl_n}__{entry["ligand_p_l"].stem}.sdf'
+             ligands_w['name'].append(ligand_file_w)
+             ligands_l['name'].append(ligand_file_l)
+             pockets['name'].append(pocket_file)
+             train_smiles.append(rdmol_to_smiles(entry['ligand_w']))
+             train_smiles.append(rdmol_to_smiles(entry['ligand_l']))
+
+         data = {'ligands_w': ligands_w,
+                 'ligands_l': ligands_l,
+                 'pockets': pockets}
+         torch.save(data, Path(processed_dir, f'{split}.pt'))
+
+         if split == 'train':
+             np.save(Path(processed_dir, 'train_smiles.npy'), train_smiles)
+
+         print(f"Processing {split} set took {(time() - tic) / 60.0:.2f} minutes")
+
+     # copy dataset statistics from the original dataset
+     size_distr_p = Path(args.datadir, 'size_distribution.npy')
+     type_histo_p = Path(args.datadir, 'ligand_type_histogram.npy')
+     bond_histo_p = Path(args.datadir, 'ligand_bond_type_histogram.npy')
+     metadata_p = Path(args.datadir, 'metadata.yml')
+     shutil.copy(size_distr_p, processed_dir)
+     shutil.copy(type_histo_p, processed_dir)
+     shutil.copy(bond_histo_p, processed_dir)
+     shutil.copy(metadata_p, processed_dir)
+
+     # copy val and test .pt files and directories
+     val_dir = Path(args.datadir, 'val')
+     test_dir = Path(args.datadir, 'test')
+     val_pt = Path(args.datadir, 'val.pt')
+     test_pt = Path(args.datadir, 'test.pt')
+     assert val_dir.exists() and test_dir.exists() and val_pt.exists() and test_pt.exists()
+     if (processed_dir / 'val').exists():
+         shutil.rmtree(processed_dir / 'val')
+     if (processed_dir / 'test').exists():
+         shutil.rmtree(processed_dir / 'test')
+     shutil.copytree(val_dir, processed_dir / 'val')
+     shutil.copytree(test_dir, processed_dir / 'test')
+     shutil.copy(val_pt, processed_dir)
+     shutil.copy(test_pt, processed_dir)
+
+     # Write error report
+     error_str = ""
+     for k, v in failed.items():
+         error_str += f"{'Split':<15}: {k[0]}\n"
+         error_str += f"{'Ligand W':<15}: {k[1]}\n"
+         error_str += f"{'Ligand L':<15}: {k[2]}\n"
+         error_str += f"{'Pocket':<15}: {k[3]}\n"
+         error_str += f"{'Error type':<15}: {v[0]}\n"
+         error_str += f"{'Error msg':<15}: {v[1]}\n\n"
+
+     with open(Path(processed_dir, 'errors.txt'), 'w') as f:
+         f.write(error_str)
+
+     with open(Path(processed_dir, 'dataset_config.txt'), 'w') as f:
+         f.write(str(args))
+
+
+ if __name__ == '__main__':
+     main()
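
A quick note on the pair-construction rule above: `return_winning_losing_smpl` only emits a preference when the gap exceeds a criterion-specific threshold (exact match for REOS flags, 0.5 for SA, 0.1 for QED and Vina efficiency). A minimal sketch of the expected behavior, with made-up scores (assuming the function is imported from src/data/process_dpo_dataset.py):

    from src.data.process_dpo_dataset import return_winning_losing_smpl

    assert return_winning_losing_smpl(0.9, 0.5, 'medchem.qed') is True    # sample 1 wins
    assert return_winning_losing_smpl(0.55, 0.5, 'medchem.qed') is None   # |diff| < 0.1 -> no preference
    assert return_winning_losing_smpl(2.1, 3.4, 'medchem.sa') is True     # lower SA score wins
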
src/data/sanifix.py ADDED
@@ -0,0 +1,159 @@
+ """ sanifix4.py
+
+ Contribution from James Davidson
+ adapted from: https://github.com/abradle/rdkitserver/blob/master/MYSITE/src/testproject/mol_parsing/sanifix.py
+ """
+ from rdkit import Chem
+ import warnings
+
+
+ def _FragIndicesToMol(oMol, indices):
+     em = Chem.EditableMol(Chem.Mol())
+
+     newIndices = {}
+     for i, idx in enumerate(indices):
+         em.AddAtom(oMol.GetAtomWithIdx(idx))
+         newIndices[idx] = i
+
+     for i, idx in enumerate(indices):
+         at = oMol.GetAtomWithIdx(idx)
+         for bond in at.GetBonds():
+             if bond.GetBeginAtomIdx() == idx:
+                 oidx = bond.GetEndAtomIdx()
+             else:
+                 oidx = bond.GetBeginAtomIdx()
+             # make sure every bond only gets added once:
+             if oidx < idx:
+                 continue
+             em.AddBond(newIndices[idx], newIndices[oidx], bond.GetBondType())
+     res = em.GetMol()
+     res.ClearComputedProps()
+     Chem.GetSymmSSSR(res)
+     res.UpdatePropertyCache(False)
+     res._idxMap = newIndices
+     return res
+
+
+ def _recursivelyModifyNs(mol, matches, indices=None):
+     if indices is None:
+         indices = []
+     res = None
+     while len(matches) and res is None:
+         tIndices = indices[:]
+         nextIdx = matches.pop(0)
+         tIndices.append(nextIdx)
+         nm = Chem.Mol(mol)
+         nm.GetAtomWithIdx(nextIdx).SetNoImplicit(True)
+         nm.GetAtomWithIdx(nextIdx).SetNumExplicitHs(1)
+         cp = Chem.Mol(nm)
+         try:
+             Chem.SanitizeMol(cp)
+         except ValueError:
+             res, indices = _recursivelyModifyNs(nm, matches, indices=tIndices)
+         else:
+             indices = tIndices
+             res = cp
+     return res, indices
+
+
+ def AdjustAromaticNs(m, nitrogenPattern='[n&D2&H0;r5,r6]'):
+     """
+     The default nitrogen pattern matches Ns in 5- and 6-membered rings
+     in order to be able to fix: O=c1ccncc1
+     """
+     Chem.GetSymmSSSR(m)
+     m.UpdatePropertyCache(False)
+
+     # break non-ring bonds linking rings:
+     em = Chem.EditableMol(m)
+     linkers = m.GetSubstructMatches(Chem.MolFromSmarts('[r]!@[r]'))
+     plsFix = set()
+     for a, b in linkers:
+         em.RemoveBond(a, b)
+         plsFix.add(a)
+         plsFix.add(b)
+     nm = em.GetMol()
+     for at in plsFix:
+         at = nm.GetAtomWithIdx(at)
+         if at.GetIsAromatic() and at.GetAtomicNum() == 7:
+             at.SetNumExplicitHs(1)
+             at.SetNoImplicit(True)
+
+     # build molecules from the fragments:
+     fragLists = Chem.GetMolFrags(nm)
+     frags = [_FragIndicesToMol(nm, x) for x in fragLists]
+
+     # loop through the fragments in turn and try to aromatize them:
+     ok = True
+     for i, frag in enumerate(frags):
+         cp = Chem.Mol(frag)
+         try:
+             Chem.SanitizeMol(cp)
+         except ValueError:
+             matches = [x[0] for x in frag.GetSubstructMatches(Chem.MolFromSmarts(nitrogenPattern))]
+             lres, indices = _recursivelyModifyNs(frag, matches)
+             if not lres:
+                 # fragment i could not be fixed
+                 ok = False
+                 break
+             else:
+                 revMap = {}
+                 for k, v in frag._idxMap.items():
+                     revMap[v] = k
+                 for idx in indices:
+                     oatom = m.GetAtomWithIdx(revMap[idx])
+                     oatom.SetNoImplicit(True)
+                     oatom.SetNumExplicitHs(1)
+     if not ok:
+         return None
+     return m
+
+
+ def fix_mol(m):
+     if m is None:
+         return None
+     try:
+         m.UpdatePropertyCache(False)
+         cp = Chem.Mol(m.ToBinary())
+         Chem.SanitizeMol(cp)
+         m = cp
+         warnings.warn(f'fine: {Chem.MolToSmiles(m)}')
+         return m
+     except ValueError:
+         warnings.warn('adjust')
+         nm = AdjustAromaticNs(m)
+         if nm is not None:
+             try:
+                 Chem.SanitizeMol(nm)
+                 warnings.warn(f'fixed: {Chem.MolToSmiles(nm)}')
+             except ValueError:
+                 warnings.warn('still broken')
+         else:
+             warnings.warn('still broken')
+         return nm
+
+
+ if __name__ == '__main__':
+     ms = [x for x in open("18.sdf").read().split("$$$$\n")]
+     for txt_m in ms:
+         if not txt_m:
+             continue
+         m = Chem.MolFromMolBlock(txt_m, False)
+         print('#---------------------')
+         try:
+             m.UpdatePropertyCache(False)
+             cp = Chem.Mol(m.ToBinary())
+             Chem.SanitizeMol(cp)
+             m = cp
+             print('fine:', Chem.MolToSmiles(m))
+         except ValueError:
+             print('adjust')
+             nm = AdjustAromaticNs(m)
+             if nm is not None:
+                 Chem.SanitizeMol(nm)
+                 print('fixed:', Chem.MolToSmiles(nm))
+             else:
+                 print('still broken')
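
The sanifix routine above exists to rescue molecules whose aromatic nitrogens come out of generation without the explicit hydrogens RDKit needs for kekulization. A minimal usage sketch (O=c1ccncc1, the example from the AdjustAromaticNs docstring, fails default sanitization until the ring nitrogen is given an explicit H):

    from rdkit import Chem
    from src.data.sanifix import AdjustAromaticNs

    m = Chem.MolFromSmiles('O=c1ccncc1', sanitize=False)
    fixed = AdjustAromaticNs(m)
    if fixed is not None:
        Chem.SanitizeMol(fixed)
        print(Chem.MolToSmiles(fixed))  # the ring N now carries an explicit H
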
src/data/so3_utils.py ADDED
@@ -0,0 +1,450 @@
+ import math
+ import torch
+
+
+ def _batch_trace(m):
+     return torch.einsum('...ii', m)
+
+
+ def regularize(point, eps=1e-6):
+     """
+     The norm of the rotation vector should be between 0 and pi.
+     Inverts the direction of the rotation axis if the angle is between pi and 2 pi.
+     Args:
+         point, (n, 3)
+     Returns:
+         regularized point, (n, 3)
+     """
+     theta = torch.linalg.norm(point, dim=-1)
+
+     # angle in [0, 2pi)
+     theta_wrapped = theta % (2 * math.pi)
+     inv_mask = theta_wrapped > math.pi
+
+     # angle in [0, pi) & invert
+     theta_wrapped[inv_mask] = -1 * (2 * math.pi - theta_wrapped[inv_mask])
+
+     # apply
+     theta = torch.clamp(theta, min=eps)
+     point = point * (theta_wrapped / theta).unsqueeze(-1)
+     assert not point.isnan().any()
+     return point
+
+
+ def random_uniform(n_samples, device=None):
+     """
+     Follows the geomstats implementation:
+     https://geomstats.github.io/_modules/geomstats/geometry/special_orthogonal.html
+
+     Args:
+         n_samples: int
+     Returns:
+         rotation vectors, (n, 3)
+     """
+     random_point = (torch.rand(n_samples, 3, device=device) * 2 - 1) * math.pi
+     random_point = regularize(random_point)
+
+     return random_point
+
+
+ def hat(rot_vec):
+     r"""
+     Maps an R^3 vector to a skew-symmetric matrix r (i.e. r \in R^{3x3} and r^T = -r).
+     Since we have the identity hat(rot_vec) v = rot_vec x v for all v \in R^3, this is
+     identical to a cross-product-matrix representation of rot_vec.
+     See also:
+     https://en.wikipedia.org/wiki/Cross_product#Conversion_to_matrix_multiplication
+     https://en.wikipedia.org/wiki/Hat_notation#Cross_product
+     Args:
+         rot_vec: (n, 3)
+     Returns:
+         skew-symmetric matrices (n, 3, 3)
+     """
+     basis = torch.tensor([
+         [[0., 0., 0.], [0., 0., -1.], [0., 1., 0.]],
+         [[0., 0., 1.], [0., 0., 0.], [-1., 0., 0.]],
+         [[0., -1., 0.], [1., 0., 0.], [0., 0., 0.]]
+     ], device=rot_vec.device)
+
+     return torch.einsum('...i,ijk->...jk', rot_vec, basis)
+
+
+ def inv_hat(skew_mat):
+     """
+     Inverse of the hat operation.
+     Args:
+         skew_mat: skew-symmetric matrices (n, 3, 3)
+     Returns:
+         rotation vectors, (n, 3)
+     """
+     assert torch.allclose(-skew_mat, skew_mat.transpose(-2, -1), atol=1e-4), \
+         f"Input not skew-symmetric (err={(-skew_mat - skew_mat.transpose(-2, -1)).abs().max():.4g})"
+
+     vec = torch.stack([
+         skew_mat[:, 2, 1],
+         skew_mat[:, 0, 2],
+         skew_mat[:, 1, 0]
+     ], dim=1)
+
+     return vec
+
+
+ def matrix_from_rotation_vector(axis_angle, eps=1e-6):
+     """
+     Args:
+         axis_angle: (n, 3)
+     Returns:
+         rotation matrices, (n, 3, 3)
+     """
+     axis_angle = regularize(axis_angle)
+     angle = axis_angle.norm(dim=-1)
+     _norm = torch.clamp(angle, min=eps).unsqueeze(-1)
+     skew_mat = hat(axis_angle / _norm)
+
+     # Rodrigues' rotation formula:
+     # https://en.wikipedia.org/wiki/Rodrigues%27_rotation_formula#Matrix_notation
+     _id = torch.eye(3, device=axis_angle.device).unsqueeze(0)
+     rot_mat = _id + \
+         torch.sin(angle)[:, None, None] * skew_mat + \
+         (1 - torch.cos(angle))[:, None, None] * torch.bmm(skew_mat, skew_mat)
+
+     return rot_mat
+
+
+ class safe_acos(torch.autograd.Function):
+     """
+     Implementation of arccos that avoids NaNs in the backward pass.
+     https://github.com/pytorch/pytorch/issues/8069#issuecomment-2041223872
+     """
+     EPS = 1e-4
+
+     @classmethod
+     def d_acos_dx(cls, x):
+         x = torch.clamp(x, min=-1. + cls.EPS, max=1. - cls.EPS)
+         return -1.0 / (1 - x**2).sqrt()
+
+     @staticmethod
+     def forward(ctx, input):
+         ctx.save_for_backward(input)
+         return input.acos()
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         input, = ctx.saved_tensors
+         return grad_output * safe_acos.d_acos_dx(input)
+
+
+ def rotation_vector_from_matrix(rot_mat, approx=1e-4):
+     """
+     Args:
+         rot_mat: (n, 3, 3)
+         approx: float, minimum angle below which an approximation will be used
+             for numerical stability
+     Returns:
+         rotation vector, (n, 3)
+     """
+     # https://en.wikipedia.org/wiki/Rotation_matrix#Conversion_from_rotation_matrix_to_axis%E2%80%93angle
+     # https://en.wikipedia.org/wiki/Axis%E2%80%93angle_representation#Log_map_from_SO(3)_to_%F0%9D%94%B0%F0%9D%94%AC(3)
+
+     # determine the axis
+     skew_mat = rot_mat - rot_mat.transpose(-2, -1)
+
+     # determine the angle
+     cos_angle = 0.5 * (_batch_trace(rot_mat) - 1)
+     # arccos is only defined between -1 and 1
+     assert torch.all(cos_angle.abs() <= 1 + 1e-6)
+     cos_angle = torch.clamp(cos_angle, min=-1., max=1.)
+     abs_angle = safe_acos.apply(cos_angle)
+
+     # avoid numerical instability; use sin(x) \approx x for small x
+     close_to_0 = abs_angle < approx
+     _fac = torch.empty_like(abs_angle)
+     _fac[close_to_0] = 0.5
+     _fac[~close_to_0] = 0.5 * abs_angle[~close_to_0] / torch.sin(abs_angle[~close_to_0])
+
+     axis_angle = inv_hat(_fac[:, None, None] * skew_mat)
+     return regularize(axis_angle)
+
+
+ def get_jacobian(point, left=True, inverse=False, eps=1e-4):
+     # Mirrors geomstats' jacobian_translation(point, left), optionally inverted:
+     # https://geomstats.github.io/_modules/geomstats/geometry/special_orthogonal.html
+     #
+     # Right Jacobian defined as J_r(theta) = \partial exp([theta]_x) / \partial theta
+     # https://math.stackexchange.com/questions/301533/jacobian-involving-so3-exponential-map-logr-expm
+     # Source:
+     # Chirikjian, Gregory S. Stochastic models, information theory, and Lie
+     # groups, volume 2: Analytic methods and modern applications. Vol. 2.
+     # Springer Science & Business Media, 2011. (page 40)
+     # NOTE: the definitions of 'inverse' and 'left' in the book are the opposite
+     # of their meanings in geomstats, whose functionality we're mimicking here.
+     # This explains the differences in the equations.
+     angle_squared = point.square().sum(-1)
+     angle = angle_squared.sqrt()
+     skew_mat = hat(point)
+
+     assert torch.all(angle <= math.pi)
+     close_to_0 = angle < eps
+     close_to_pi = (math.pi - angle) < eps
+
+     angle = angle[:, None, None]
+     angle_squared = angle_squared[:, None, None]
+
+     if inverse:
+         # J = I + (1 - cos(theta)) / theta^2 * [theta]_x
+         #       + (theta - sin(theta)) / theta^3 * [theta]_x^2
+         # (the right-hand sides are masked so that the shapes match on assignment)
+         _term1 = torch.empty_like(angle)
+         _term1[close_to_0] = 0.5  # approximate with the value at zero
+         _term1[~close_to_0] = (1 - torch.cos(angle[~close_to_0])) / angle_squared[~close_to_0]
+
+         _term2 = torch.empty_like(angle)
+         _term2[close_to_0] = 1 / 6  # approximate with the value at zero
+         _term2[~close_to_0] = (angle[~close_to_0] - torch.sin(angle[~close_to_0])) / angle[~close_to_0] ** 3
+
+         jacobian = torch.eye(3, device=point.device).unsqueeze(0) + \
+             _term1 * skew_mat + _term2 * (skew_mat @ skew_mat)
+     else:
+         # J = I - 0.5 * [theta]_x
+         #       + (1 / theta^2 - (1 + cos(theta)) / (2 theta sin(theta))) * [theta]_x^2
+         _term1 = torch.empty_like(angle)
+         _term1[close_to_0] = 1 / 12  # approximate with the value at zero
+         _term1[close_to_pi] = 1 / math.pi**2  # approximate with the value at pi
+         default = ~close_to_0 & ~close_to_pi
+         _term1[default] = 1 / angle_squared[default] - \
+             (1 + torch.cos(angle[default])) / (2 * angle[default] * torch.sin(angle[default]))
+
+         jacobian = torch.eye(3, device=point.device).unsqueeze(0) - \
+             0.5 * skew_mat + _term1 * (skew_mat @ skew_mat)
+
+     if left:
+         jacobian = jacobian.transpose(-2, -1)
+
+     return jacobian
+
+
+ def compose_rotations(rot_vec_1, rot_vec_2):
+     rot_mat_1 = matrix_from_rotation_vector(rot_vec_1)
+     rot_mat_2 = matrix_from_rotation_vector(rot_vec_2)
+     rot_mat_out = torch.bmm(rot_mat_1, rot_mat_2)
+     return rotation_vector_from_matrix(rot_mat_out)
+
+
+ def exp(tangent):
+     """
+     Exponential map at the identity.
+     Args:
+         tangent: vector on the tangent space, (n, 3)
+     Returns:
+         rotation vector on the manifold, (n, 3)
+     """
+     # rotations are already represented by rotation vectors
+     exp_from_identity = regularize(tangent)
+     return exp_from_identity
+
+
+ def exp_not_from_identity(tangent_vec, base_point):
+     """
+     Exponential map at a base point.
+     Args:
+         tangent_vec: vector on the tangent plane, (n, 3)
+         base_point: base point on the manifold, (n, 3)
+     Returns:
+         new point on the manifold, (n, 3)
+     """
+     tangent_vec = regularize(tangent_vec)
+     base_point = regularize(base_point)
+
+     # the Lie algebra is the tangent space at the identity element of a Lie group
+     # -> move to the identity
+     jacobian = get_jacobian(base_point, left=True, inverse=True)
+     tangent_vec_at_id = torch.einsum("...ij,...j->...i", jacobian, tangent_vec)
+
+     # exponential map from the identity
+     exp_from_identity = exp(tangent_vec_at_id)
+
+     # -> back to the base point
+     return compose_rotations(base_point, exp_from_identity)
+
+
+ def log(rot_vec, as_skew=False):
+     """
+     Logarithm map to the tangent space at the identity.
+     Args:
+         rot_vec: point on the manifold, (n, 3)
+     Returns:
+         vector on the tangent space, (n, 3)
+     """
+     # rotations are already represented by rotation vectors
+     log_from_id = rot_vec
+     if as_skew:
+         log_from_id = hat(log_from_id)
+     return log_from_id
+
+
+ def log_not_from_identity(point, base_point):
+     """
+     Logarithm map of a point from a base point.
+     Args:
+         point: point on the manifold, (n, 3)
+         base_point: base point on the manifold, (n, 3)
+     Returns:
+         vector on the tangent plane, (n, 3)
+     """
+     point = regularize(point)
+     base_point = regularize(base_point)
+
+     inv_base_point = -1 * base_point
+
+     point_near_id = compose_rotations(inv_base_point, point)
+
+     # logarithm map from the identity
+     log_from_id = log(point_near_id)
+
+     jacobian = get_jacobian(base_point, inverse=False)
+     tangent_vec_at_id = torch.einsum("...ij,...j->...i", jacobian, log_from_id)
+
+     return tangent_vec_at_id
+
+
+ if __name__ == "__main__":
+
+     import os
+     os.environ['GEOMSTATS_BACKEND'] = "pytorch"
+     import scipy.optimize  # does not seem to be imported correctly when just loading geomstats
+     default_dtype = torch.get_default_dtype()
+     from geomstats.geometry.special_orthogonal import SpecialOrthogonal
+     torch.set_default_dtype(default_dtype)  # geomstats changes the default dtype when imported
+
+     so3_vector = SpecialOrthogonal(n=3, point_type="vector")
+
+     # decorator that makes geomstats run with the correct device/tensor type
+     if torch.__version__ >= '2.0.0':
+         GEOMSTATS_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+         def geomstats_tensor_type(func):
+             def inner(*args, **kwargs):
+                 with torch.device(GEOMSTATS_DEVICE):
+                     out = func(*args, **kwargs)
+                 return out
+
+             return inner
+     else:
+         GEOMSTATS_TENSOR_TYPE = 'torch.cuda.FloatTensor' if torch.cuda.is_available() else 'torch.FloatTensor'
+
+         def geomstats_tensor_type(func):
+             def inner(*args, **kwargs):
+                 torch.set_default_tensor_type(GEOMSTATS_TENSOR_TYPE)
+                 out = func(*args, **kwargs)
+                 torch.set_default_tensor_type('torch.FloatTensor')
+                 return out
+
+             return inner
+
+     @geomstats_tensor_type
+     def gs_matrix_from_rotation_vector(*args, **kwargs):
+         return so3_vector.matrix_from_rotation_vector(*args, **kwargs)
+
+     @geomstats_tensor_type
+     def gs_rotation_vector_from_matrix(*args, **kwargs):
+         return so3_vector.rotation_vector_from_matrix(*args, **kwargs)
+
+     @geomstats_tensor_type
+     def gs_exp_not_from_identity(*args, **kwargs):
+         return so3_vector.exp_not_from_identity(*args, **kwargs)
+
+     @geomstats_tensor_type
+     def gs_log_not_from_identity(*args, **kwargs):
+         # the norm of the rotation vector will be between 0 and pi
+         return so3_vector.log_not_from_identity(*args, **kwargs)
+
+     @geomstats_tensor_type
+     def compose(*args, **kwargs):
+         return so3_vector.compose(*args, **kwargs)
+
+     @geomstats_tensor_type
+     def inverse(*args, **kwargs):
+         return so3_vector.inverse(*args, **kwargs)
+
+     @geomstats_tensor_type
+     def gs_random_uniform(*args, **kwargs):
+         return so3_vector.random_uniform(*args, **kwargs)
+
+     #############
+     # RUN TESTS #
+     #############
+
+     n = 16
+     device = 'cuda' if torch.cuda.is_available() else None
+
+     ### regularize ###
+     vec = (torch.rand(n, 3) * 4 - 2) * math.pi
+     axis_angle = regularize(vec)
+     assert torch.all(torch.cross(vec, axis_angle, dim=-1).norm(dim=-1) < 1e-5), "not all vectors collinear"
+     assert torch.all(axis_angle.norm(dim=-1) < math.pi) & torch.all(axis_angle.norm(dim=-1) >= 0), "norm not between 0 and pi"
+
+     ### matrix_from_rotation_vector ###
+     rot_vec = random_uniform(n, device=device)
+     assert torch.allclose(matrix_from_rotation_vector(rot_vec),
+                           gs_matrix_from_rotation_vector(rot_vec), atol=1e-06)
+
+     ### rotation_vector_from_matrix ###
+     rot_vec = random_uniform(n, device=device)
+     rot_mat = matrix_from_rotation_vector(rot_vec)
+     assert torch.allclose(rotation_vector_from_matrix(rot_mat),
+                           gs_rotation_vector_from_matrix(rot_mat), atol=1e-05)
+
+     ### exp_not_from_identity ###
+     tangent_vec = random_uniform(n, device=device)
+     base_pt = random_uniform(n, device=device)
+     my_val = exp_not_from_identity(tangent_vec, base_pt)
+     gs_val = gs_exp_not_from_identity(tangent_vec, base_pt)
+     assert torch.allclose(my_val, gs_val, atol=1e-03), (my_val - gs_val).abs().max()
+
+     ### log_not_from_identity ###
+     pt = random_uniform(n, device=device)
+     base_pt = random_uniform(n, device=device)
+     my_val = log_not_from_identity(pt, base_pt)
+     gs_val = gs_log_not_from_identity(pt, base_pt)
+     assert torch.allclose(my_val, gs_val, atol=1e-03), (my_val - gs_val).abs().max()
+
+     print("All tests successful!")
src/default/size_distribution.npy ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4e677a30c4b972051499bb5577a0de773e4f92ec54c282d432f94873406ec7e
+ size 158488
src/generate.py ADDED
@@ -0,0 +1,204 @@
+ import argparse
+ import sys
+ import warnings
+ import tempfile
+ import pandas as pd
+
+ from Bio.PDB import PDBParser
+ from pathlib import Path
+ from rdkit import Chem
+ from torch.utils.data import DataLoader
+ from functools import partial
+
+ basedir = Path(__file__).resolve().parent.parent
+ sys.path.append(str(basedir))
+ warnings.filterwarnings("ignore")
+
+ from src import utils
+ from src.data.dataset import ProcessedLigandPocketDataset
+ from src.data.data_utils import TensorDict, process_raw_pair
+ from src.model.lightning import DrugFlow
+ from src.sbdd_metrics.metrics import FullEvaluator
+
+ from tqdm import tqdm
+
+
+ def aggregate_metrics(table):
+     """Add one aggregate column per metric family, averaging all of its sub-columns."""
+     families = [
+         ('posebusters', None),
+         ('reos', None),
+         ('chembl_ring_systems', 'smi'),  # skip the raw SMILES columns
+     ]
+     for agg_col, exclude_suffix in families:
+         total = 0
+         table[agg_col] = 0
+         for column in list(table.columns):
+             if not column.startswith(agg_col) or column == agg_col:
+                 continue
+             if exclude_suffix is not None and column.endswith(exclude_suffix):
+                 continue
+             table[agg_col] += table[column].fillna(0).astype(float)
+             total += 1
+         table[agg_col] = table[agg_col] / total
+     return table
+
+
+ if __name__ == "__main__":
+     p = argparse.ArgumentParser()
+     p.add_argument('--protein', type=str, required=True, help="Input PDB file.")
+     p.add_argument('--ref_ligand', type=str, required=True, help="SDF file with the reference ligand used to define the pocket.")
+     p.add_argument('--checkpoint', type=str, required=True, help="Model checkpoint file.")
+     p.add_argument('--molecule_size', type=str, required=False, default=None, help="Number of atoms in the sampled molecules. Can be a single number or a range, e.g. '15,20'. If None, sizes will be sampled.")
+     p.add_argument('--output', type=str, required=False, default='samples.sdf', help="Output file.")
+     p.add_argument('--n_samples', type=int, required=False, default=10, help="Number of sampled molecules.")
+     p.add_argument('--batch_size', type=int, required=False, default=32, help="Batch size.")
+     p.add_argument('--pocket_distance_cutoff', type=float, required=False, default=8.0, help="Distance cutoff to define the pocket around the reference ligand.")
+     p.add_argument('--n_steps', type=int, required=False, default=None, help="Number of denoising steps.")
+     p.add_argument('--device', type=str, required=False, default='cuda:0', help="Device to use.")
+     p.add_argument('--datadir', type=Path, required=False, default=Path(basedir, 'src', 'default'), help="Needs to be specified to sample molecule sizes.")
+     p.add_argument('--seed', type=int, required=False, default=42, help="Random seed.")
+     p.add_argument('--filter', action='store_true', required=False, default=False, help="Apply basic filters and keep sampling until `n_samples` molecules passing these filters are found.")
+     p.add_argument('--metrics_output', type=str, required=False, default=None, help="If provided, metrics will be computed and saved in csv format at this location.")
+     p.add_argument('--gnina', type=str, required=False, default=None, help="Path to a gnina executable. Required for computing docking scores.")
+     p.add_argument('--reduce', type=str, required=False, default=None, help="Path to a reduce executable. Required for computing interactions.")
+     args = p.parse_args()
+
+     utils.set_deterministic(seed=args.seed)
+     utils.disable_rdkit_logging()
+
+     if args.molecule_size is None and (args.datadir is None or not args.datadir.exists()):
+         raise NotImplementedError(
+             "Please provide a path to the processed dataset (using `--datadir`) "
+             "to infer the number of nodes. It contains the size distribution histogram."
+         )
+
+     if not args.filter:
+         args.batch_size = min(args.batch_size, args.n_samples)
+
+     # Load the model
+     model = DrugFlow.load_from_checkpoint(args.checkpoint, map_location=args.device, strict=False)
+     if args.datadir is not None:
+         model.datadir = args.datadir
+
+     model.setup(stage='generation')
+     model.batch_size = model.eval_batch_size = args.batch_size
+     model.eval().to(args.device)
+     if args.n_steps is not None:
+         model.T = args.n_steps
+
+     # Determine the molecule size (a fixed number, a uniform range, or sampled by the model)
+     molecule_size = None
+     if args.molecule_size is not None:
+         if args.molecule_size.isdigit():
+             molecule_size = int(args.molecule_size)
+             print(f'Will generate molecules of size {molecule_size}')
+         else:
+             boundaries = [x.strip() for x in args.molecule_size.split(',')]
+             assert len(boundaries) == 2 and boundaries[0].isdigit() and boundaries[1].isdigit()
+             left = int(boundaries[0])
+             right = int(boundaries[1])
+             molecule_size = f"uniform_{left}_{right}"
+             print(f'Will generate molecules with numbers of atoms sampled from U({left}, {right})')
+
+     # Prepare the input
+     pdb_model = PDBParser(QUIET=True).get_structure('', args.protein)[0]
+     rdmol = Chem.SDMolSupplier(str(args.ref_ligand))[0]
+
+     ligand, pocket = process_raw_pair(
+         pdb_model, rdmol,
+         dist_cutoff=args.pocket_distance_cutoff,
+         pocket_representation=model.pocket_representation,
+         compute_nerf_params=True,
+         nma_input=args.protein if model.dynamics.add_nma_feat else None
+     )
+     ligand['name'] = 'ligand'
+     dataset = [{'ligand': ligand, 'pocket': pocket} for _ in range(args.batch_size)]
+     dataloader = DataLoader(
+         dataset=dataset,
+         batch_size=args.batch_size,
+         collate_fn=partial(ProcessedLigandPocketDataset.collate_fn, ligand_transform=None),
+         pin_memory=True
+     )
+
+     # Start sampling
+     smiles = set()
+     sampled_molecules = []
+     metrics = []
+     Path(args.output).parent.absolute().mkdir(parents=True, exist_ok=True)
+     print(f'Will generate {args.n_samples} samples')
+
+     evaluator = FullEvaluator(gnina=args.gnina, reduce=args.reduce)
+
+     with tqdm(total=args.n_samples) as pbar:
+         while len(sampled_molecules) < args.n_samples:
+             for data in dataloader:
+                 new_data = {
+                     'ligand': TensorDict(**data['ligand']).to(args.device),
+                     'pocket': TensorDict(**data['pocket']).to(args.device),
+                 }
+                 rdmols, rdpockets, _ = model.sample(
+                     new_data,
+                     n_samples=1,
+                     timesteps=args.n_steps,
+                     num_nodes=molecule_size,
+                 )
+
+                 if args.filter or (args.metrics_output is not None):
+                     results = []
+                     with tempfile.TemporaryDirectory() as tmpdir:
+                         for mol, receptor in zip(rdmols, rdpockets):
+                             receptor_path = Path(tmpdir, 'receptor.pdb')
+                             Chem.MolToPDBFile(receptor, str(receptor_path))
+                             results.append(evaluator(mol, receptor_path))
+
+                     table = pd.DataFrame(results)
+                     table['novel'] = ~table['representation.smiles'].isin(smiles)
+                     table = aggregate_metrics(table)
+
+                 added_molecules = 0
+                 if args.filter:
+                     table['passed_filters'] = (
+                         (table['posebusters'] == 1) &
+                         # (table['reos'] == 1) &
+                         (table['chembl_ring_systems'] == 1) &
+                         (table['novel'] == 1)
+                     )
+                     for i, (passed, smi) in enumerate(table[['passed_filters', 'representation.smiles']].values):
+                         if passed:
+                             sampled_molecules.append(rdmols[i])
+                             smiles.add(smi)
+                             added_molecules += 1
+
+                     if args.metrics_output is not None:
+                         metrics.append(table[table['passed_filters']])
+
+                 else:
+                     sampled_molecules.extend(rdmols)
+                     added_molecules = len(rdmols)
+                     if args.metrics_output is not None:
+                         metrics.append(table)
+
+                 pbar.update(added_molecules)
+
+     # Write the results
+     utils.write_sdf_file(args.output, sampled_molecules)
+
+     if args.metrics_output is not None:
+         metrics = pd.concat(metrics)
+         metrics.to_csv(args.metrics_output, index=False)
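
For reference, this is roughly what the filtering above sees after aggregation (the column names below are illustrative; the real FullEvaluator emits many more per family). aggregate_metrics adds one column per metric family holding the fraction of its checks that passed:

    import pandas as pd

    df = pd.DataFrame([{
        'posebusters.sanitization': True,
        'posebusters.all_atoms_connected': False,
        'reos.pains': True,
        'chembl_ring_systems.ring_in_chembl': True,
    }])
    df = aggregate_metrics(df)
    print(df['posebusters'].iloc[0])  # 0.5 -> half of the PoseBusters checks passed
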
src/model/diffusion_utils.py ADDED
@@ -0,0 +1,206 @@
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+
+
+ class DistributionNodes:
+     def __init__(self, histogram):
+
+         histogram = torch.tensor(histogram).float()
+         histogram = histogram + 1e-3  # for numerical stability
+
+         prob = histogram / histogram.sum()
+
+         self.idx_to_n_nodes = torch.tensor(
+             [[(i, j) for j in range(prob.shape[1])] for i in range(prob.shape[0])]
+         ).view(-1, 2)
+
+         self.n_nodes_to_idx = {tuple(x.tolist()): i
+                                for i, x in enumerate(self.idx_to_n_nodes)}
+
+         self.prob = prob
+         self.m = torch.distributions.Categorical(self.prob.view(-1),
+                                                  validate_args=True)
+
+         self.n1_given_n2 = \
+             [torch.distributions.Categorical(prob[:, j], validate_args=True)
+              for j in range(prob.shape[1])]
+         self.n2_given_n1 = \
+             [torch.distributions.Categorical(prob[i, :], validate_args=True)
+              for i in range(prob.shape[0])]
+
+     def sample(self, n_samples=1):
+         idx = self.m.sample((n_samples,))
+         num_nodes_lig, num_nodes_pocket = self.idx_to_n_nodes[idx].T
+         return num_nodes_lig, num_nodes_pocket
+
+     def sample_conditional(self, n1=None, n2=None):
+         assert (n1 is None) ^ (n2 is None), \
+             "Exactly one input argument must be None"
+
+         m = self.n1_given_n2 if n2 is not None else self.n2_given_n1
+         c = n2 if n2 is not None else n1
+
+         return torch.tensor([m[i].sample() for i in c], device=c.device)
+
+     def log_prob(self, batch_n_nodes_1, batch_n_nodes_2):
+         assert len(batch_n_nodes_1.size()) == 1
+         assert len(batch_n_nodes_2.size()) == 1
+
+         idx = torch.tensor(
+             [self.n_nodes_to_idx[(n1, n2)]
+              for n1, n2 in zip(batch_n_nodes_1.tolist(), batch_n_nodes_2.tolist())]
+         )
+
+         log_probs = self.m.log_prob(idx)
+
+         return log_probs.to(batch_n_nodes_1.device)
+
+     def log_prob_n1_given_n2(self, n1, n2):
+         assert len(n1.size()) == 1
+         assert len(n2.size()) == 1
+         log_probs = torch.stack([self.n1_given_n2[c].log_prob(i.cpu())
+                                  for i, c in zip(n1, n2)])
+         return log_probs.to(n1.device)
+
+     def log_prob_n2_given_n1(self, n2, n1):
+         assert len(n2.size()) == 1
+         assert len(n1.size()) == 1
+         log_probs = torch.stack([self.n2_given_n1[c].log_prob(i.cpu())
+                                  for i, c in zip(n2, n1)])
+         return log_probs.to(n2.device)
+
+
+ def cosine_beta_schedule_midi(timesteps, s=0.008, nu=1.0, clip=False):
+     r"""
+     Modified cosine schedule as proposed in https://arxiv.org/abs/2302.09048.
+     Note: we use (t/T)^\nu, not (t/T + s)^\nu as written in the MiDi paper.
+     We also divide by alphas_cumprod[0] as in the original cosine schedule from
+     https://arxiv.org/abs/2102.09672
+     """
+     device = nu.device if torch.is_tensor(nu) else None
+     x = torch.linspace(0, timesteps, timesteps + 1, device=device)
+     alphas_cumprod = torch.cos(0.5 * np.pi * ((x / timesteps)**nu + s) / (1 + s)) ** 2
+     alphas_cumprod = alphas_cumprod / alphas_cumprod[0]
+
+     if clip:
+         alphas_cumprod = torch.cat([torch.tensor([1.0], device=alphas_cumprod.device), alphas_cumprod])
+         betas = 1 - (alphas_cumprod[1:] / alphas_cumprod[:-1])
+         betas = torch.clip(betas, min=0, max=0.999)
+         alphas = 1. - betas
+         alphas_cumprod = torch.cumprod(alphas, dim=0)
+     return alphas_cumprod
+
+
+ class CosineSchedule(torch.nn.Module):
+     """
+     nu=1.0 corresponds to the standard cosine schedule.
+     """
+
+     def __init__(self, timesteps, nu=1.0, trainable=False, clip_alpha2_step=0.001):
+         super(CosineSchedule, self).__init__()
+         self.timesteps = timesteps
+         self.trainable = trainable
+         self.nu = nu
+         assert 0.0 <= clip_alpha2_step < 1.0
+         self.clip = clip_alpha2_step
+
+         if self.trainable:
+             self.nu = torch.nn.Parameter(torch.Tensor([nu]), requires_grad=True)
+         else:
+             self._alpha2 = self.alphas2
+             self._gamma = torch.nn.Parameter(self.gammas, requires_grad=False)
+
+     @property
+     def alphas2(self):
+         """
+         Cumulative alpha squared.
+         Called alpha_bar in: Nichol, Alexander Quinn, and Prafulla Dhariwal.
+         "Improved denoising diffusion probabilistic models." PMLR, 2021.
+         """
+         if hasattr(self, '_alpha2'):
+             return self._alpha2
+
+         assert isinstance(self.nu, float) or not self.nu.isnan().any()
+
+         # our alpha is equivalent to sqrt(alpha) from https://arxiv.org/abs/2102.09672,
+         # where the cosine schedule was introduced
+         alphas2 = cosine_beta_schedule_midi(self.timesteps, nu=self.nu, clip=False)
+
+         # avoid singularities near t=T
+         alphas2 = torch.cat([torch.tensor([1.0], device=alphas2.device), alphas2])
+         alphas2_step = alphas2[1:] / alphas2[:-1]
+         alphas2_step = torch.clip(alphas2_step, min=self.clip, max=1.0)
+         alphas2 = torch.cumprod(alphas2_step, dim=0)
+
+         return alphas2
+
+     @property
+     def alphas2_t_given_tminus1(self):
+         """
+         Alphas for a single transition.
+         """
+         alphas2 = torch.cat([torch.tensor([1.0]), self.alphas2])
+         return alphas2[1:] / alphas2[:-1]
+
+     @property
+     def gammas(self):
+         """
+         Gammas as defined in appendix B of the EDM paper:
+         gamma_t = -(log alpha_t^2 - log sigma_t^2)
+         """
+         if hasattr(self, '_gamma'):
+             return self._gamma
+
+         alphas2 = self.alphas2
+         sigmas2 = 1 - alphas2
+
+         gammas = -(torch.log(alphas2) - torch.log(sigmas2))
+
+         return gammas.float()
+
+     def forward(self, t):
+         t_int = torch.round(t * self.timesteps).long()
+         return self.gammas[t_int]
+
+     @staticmethod
+     def alpha(gamma):
+         """ Computes alpha given gamma. """
+         return torch.sqrt(torch.sigmoid(-gamma))
+
+     @staticmethod
+     def sigma(gamma):
+         """ Computes sigma given gamma. """
+         return torch.sqrt(torch.sigmoid(gamma))
+
+     @staticmethod
+     def SNR(gamma):
+         """ Computes the signal-to-noise ratio (alpha^2/sigma^2) given gamma. """
+         return torch.exp(-gamma)
+
+     def sigma_and_alpha_t_given_s(self, gamma_t: torch.Tensor, gamma_s: torch.Tensor):
+         """
+         Computes sigma_t_given_s, using gamma_t and gamma_s. Used during sampling.
+         These are defined as:
+             alpha_t_given_s = alpha_t / alpha_s,
+             sigma_t_given_s = sqrt(1 - (alpha_t_given_s)^2).
+         """
+         sigma2_t_given_s = -torch.expm1(
+             F.softplus(gamma_s) - F.softplus(gamma_t))
+
+         # alpha_t_given_s = alpha_t / alpha_s
+         log_alpha2_t = F.logsigmoid(-gamma_t)
+         log_alpha2_s = F.logsigmoid(-gamma_s)
+         log_alpha2_t_given_s = log_alpha2_t - log_alpha2_s
+
+         alpha_t_given_s = torch.exp(0.5 * log_alpha2_t_given_s)
+         alpha_t_given_s = torch.clip(alpha_t_given_s, min=self.clip ** 0.5, max=1.0)
+
+         sigma_t_given_s = torch.sqrt(sigma2_t_given_s)
+
+         return sigma2_t_given_s, sigma_t_given_s, alpha_t_given_s
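
A minimal sanity-check sketch for the schedule above (alpha(gamma)^2 + sigma(gamma)^2 = 1 holds by construction, since sigmoid(-g) + sigmoid(g) = 1, and gamma grows monotonically with t as the noise level increases):

    import torch

    schedule = CosineSchedule(timesteps=500, nu=1.0)
    t = torch.linspace(0.1, 1.0, 10)  # avoid t=0, where sigma is exactly zero
    gamma = schedule(t)
    a, s = CosineSchedule.alpha(gamma), CosineSchedule.sigma(gamma)
    assert torch.allclose(a ** 2 + s ** 2, torch.ones_like(a))
    assert torch.all(gamma[1:] >= gamma[:-1])  # noise level never decreases
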
src/model/dpo.py ADDED
@@ -0,0 +1,252 @@
+ from typing import Optional
+ from pathlib import Path
+ from contextlib import nullcontext
+
+ import torch
+ import torch.nn.functional as F
+ from torch_scatter import scatter_mean
+
+ from src.constants import atom_encoder, bond_encoder
+ from src.model.lightning import DrugFlow, set_default
+ from src.data.dataset import ProcessedLigandPocketDataset, DPODataset
+ from src.data.data_utils import AppendVirtualNodesInCoM, Residues, center_data
+
+
+ class DPO(DrugFlow):
+     def __init__(self, dpo_mode, ref_checkpoint_p, **kwargs):
+         super(DPO, self).__init__(**kwargs)
+         self.dpo_mode = dpo_mode
+         self.dpo_beta = kwargs['loss_params'].dpo_beta if 'dpo_beta' in kwargs['loss_params'] else 100.0
+         self.dpo_beta_schedule = kwargs['loss_params'].dpo_beta_schedule if 'dpo_beta_schedule' in kwargs['loss_params'] else 't'
+         self.clamp_dpo = kwargs['loss_params'].clamp_dpo if 'clamp_dpo' in kwargs['loss_params'] else True
+         self.dpo_lambda_dpo = kwargs['loss_params'].dpo_lambda_dpo if 'dpo_lambda_dpo' in kwargs['loss_params'] else 1
+         self.dpo_lambda_w = kwargs['loss_params'].dpo_lambda_w if 'dpo_lambda_w' in kwargs['loss_params'] else 1
+         self.dpo_lambda_l = kwargs['loss_params'].dpo_lambda_l if 'dpo_lambda_l' in kwargs['loss_params'] else 0.2
+         self.dpo_lambda_h = kwargs['loss_params'].dpo_lambda_h if 'dpo_lambda_h' in kwargs['loss_params'] else kwargs['loss_params'].lambda_h
+         self.dpo_lambda_e = kwargs['loss_params'].dpo_lambda_e if 'dpo_lambda_e' in kwargs['loss_params'] else kwargs['loss_params'].lambda_e
+
+         # frozen reference model for the DPO objective
+         self.ref_dynamics = self.init_model(kwargs['predictor_params'])
+         state_dict = torch.load(ref_checkpoint_p)['state_dict']
+         self.ref_dynamics.load_state_dict({k.replace('dynamics.', ''): v for k, v in state_dict.items() if k.startswith('dynamics.')})
+         print(f'Loaded reference model from {ref_checkpoint_p}')
+         # initialize the trainable model with the reference model parameters
+         self.dynamics.load_state_dict(self.ref_dynamics.state_dict())
+
+     def get_dataset(self, stage, pocket_transform=None):
+
+         # when sampling, we don't append virtual nodes as we might need access to the ground truth size
+         if self.virtual_nodes and stage == 'train':
+             ligand_transform = AppendVirtualNodesInCoM(
+                 atom_encoder, bond_encoder, add_min=self.add_virtual_min, add_max=self.add_virtual_max)
+         else:
+             ligand_transform = None
+
+         # we want to know if something goes wrong on the validation or test set
+         catch_errors = stage == 'train'
+
+         if self.sharded_dataset:
+             raise NotImplementedError('Sharded dataset not implemented for DPO')
+
+         if self.sample_from_clusters and stage == 'train':  # val/test should be deterministic
+             raise NotImplementedError('Sampling from clusters not implemented for DPO')
+
+         if stage == 'train':
+             return DPODataset(
+                 Path(self.datadir, 'train.pt'),
+                 ligand_transform=None,
+                 pocket_transform=pocket_transform,
+                 catch_errors=True,
+             )
+         else:
+             return ProcessedLigandPocketDataset(
+                 pt_path=Path(self.datadir, 'val.pt' if self.debug else f'{stage}.pt'),
+                 ligand_transform=ligand_transform,
+                 pocket_transform=pocket_transform,
+                 catch_errors=catch_errors,
+             )
+
+     def training_step(self, data, *args):
+         ligand_w, ligand_l, pocket = data['ligand'], data['ligand_l'], data['pocket']
+         loss, info = self.compute_dpo_loss(pocket, ligand_w=ligand_w, ligand_l=ligand_l, return_info=True)
+
+         if torch.isnan(loss):
+             print(f'Loss is NaN at epoch {self.current_epoch}. Info: {info}')
+
+         log_dict = {k: v for k, v in info.items() if isinstance(v, float) or torch.numel(v) <= 1}
+         self.log_metrics({'loss': loss, **log_dict}, 'train', batch_size=len(ligand_w['size']))
+
+         out = {'loss': loss, **info}
+         self.training_step_outputs.append(out)
+         return out
+
+     def validation_step(self, data, *args):
+         return super().validation_step(data, *args)
+
+     def compute_dpo_loss(self, pocket, ligand_w, ligand_l, return_info=False):
+         t = torch.rand(ligand_w['size'].size(0), device=ligand_w['x'].device).unsqueeze(-1)
+
+         if self.dpo_beta_schedule == 't':
+             # from https://arxiv.org/pdf/2407.13981
+             beta_t = (self.dpo_beta * t).squeeze()
+         elif self.dpo_beta_schedule == 'const':
+             beta_t = self.dpo_beta
+         else:
+             raise ValueError(f'Unknown DPO beta schedule: {self.dpo_beta_schedule}')
+
+         loss_dict_w = self.compute_loss_single_pair(ligand_w, pocket, t)
+         loss_dict_l = self.compute_loss_single_pair(ligand_l, pocket, t)
+         info = {
+             'loss_x_w': loss_dict_w['theta']['x'].mean().item(),
+             'loss_h_w': loss_dict_w['theta']['h'].mean().item(),
+             'loss_e_w': loss_dict_w['theta']['e'].mean().item(),
+             'loss_x_l': loss_dict_l['theta']['x'].mean().item(),
+             'loss_h_l': loss_dict_l['theta']['h'].mean().item(),
+             'loss_e_l': loss_dict_l['theta']['e'].mean().item(),
+         }
+         if self.dpo_mode == 'single_dpo_comp':
+             loss_w_theta = (
+                 loss_dict_w['theta']['x'] +
+                 self.dpo_lambda_h * loss_dict_w['theta']['h'] +
+                 self.dpo_lambda_e * loss_dict_w['theta']['e']
+             )
+             loss_w_ref = (
+                 loss_dict_w['ref']['x'] +
+                 self.dpo_lambda_h * loss_dict_w['ref']['h'] +
+                 self.dpo_lambda_e * loss_dict_w['ref']['e']
+             )
+             loss_l_theta = (
+                 loss_dict_l['theta']['x'] +
+                 self.dpo_lambda_h * loss_dict_l['theta']['h'] +
+                 self.dpo_lambda_e * loss_dict_l['theta']['e']
+             )
+             loss_l_ref = (
+                 loss_dict_l['ref']['x'] +
+                 self.dpo_lambda_h * loss_dict_l['ref']['h'] +
+                 self.dpo_lambda_e * loss_dict_l['ref']['e']
+             )
+             diff_w = loss_w_theta - loss_w_ref
+             diff_l = loss_l_theta - loss_l_ref
+             info['diff_w'] = diff_w.mean().item()
+             info['diff_l'] = diff_l.mean().item()
+             diff = -1 * beta_t * (diff_w - diff_l)
+             loss = -1 * F.logsigmoid(diff)
+         elif self.dpo_mode == 'single_dpo_comp_v3':
+             diff_w_x = loss_dict_w['theta']['x'] - loss_dict_w['ref']['x']
+             diff_w_h = loss_dict_w['theta']['h'] - loss_dict_w['ref']['h']
+             diff_w_e = loss_dict_w['theta']['e'] - loss_dict_w['ref']['e']
+             diff_l_x = loss_dict_l['theta']['x'] - loss_dict_l['ref']['x']
+             diff_l_h = loss_dict_l['theta']['h'] - loss_dict_l['ref']['h']
+             diff_l_e = loss_dict_l['theta']['e'] - loss_dict_l['ref']['e']
+             info['diff_w_x'] = diff_w_x.mean().item()
+             info['diff_w_h'] = diff_w_h.mean().item()
+             info['diff_w_e'] = diff_w_e.mean().item()
+             info['diff_l_x'] = diff_l_x.mean().item()
+             info['diff_l_h'] = diff_l_h.mean().item()
+             info['diff_l_e'] = diff_l_e.mean().item()
+
+             # not used for the loss, just for logging
+             _diff_w = diff_w_x + self.dpo_lambda_h * diff_w_h + self.dpo_lambda_e * diff_w_e
+             _diff_l = diff_l_x + self.dpo_lambda_h * diff_l_h + self.dpo_lambda_e * diff_l_e
+             info['diff_w'] = _diff_w.mean().item()
+             info['diff_l'] = _diff_l.mean().item()
+
+             diff_x = diff_w_x - diff_l_x
+             diff_h = diff_w_h - diff_l_h
+             diff_e = diff_w_e - diff_l_e
+             info['diff_x'] = diff_x.mean().item()
+             info['diff_h'] = diff_h.mean().item()
+             info['diff_e'] = diff_e.mean().item()
+
+             diff = -1 * beta_t * (diff_x + self.dpo_lambda_h * diff_h + self.dpo_lambda_e * diff_e)
+             if self.clamp_dpo:
+                 diff = diff.clamp(-10, 10)
+             info['dpo_arg_min'] = diff.min().item()
+             info['dpo_arg_max'] = diff.max().item()
+             info['dpo_arg_mean'] = diff.mean().item()
+             dpo_loss = -1 * self.dpo_lambda_dpo * F.logsigmoid(diff)
+             info['dpo_loss'] = dpo_loss.mean().item()
+
+             loss_w_theta_reg = (
+                 loss_dict_w['theta']['x'] +
+                 self.lambda_h * loss_dict_w['theta']['h'] +
+                 self.lambda_e * loss_dict_w['theta']['e']
+             )
+             info['loss_w_theta_reg'] = loss_w_theta_reg.mean().item()
+             loss_l_theta_reg = (
+                 loss_dict_l['theta']['x'] +
+                 self.lambda_h * loss_dict_l['theta']['h'] +
+                 self.lambda_e * loss_dict_l['theta']['e']
+             )
+             info['loss_l_theta_reg'] = loss_l_theta_reg.mean().item()
+             dpo_reg = self.dpo_lambda_w * loss_w_theta_reg + \
+                 self.dpo_lambda_l * loss_l_theta_reg
+             info['dpo_reg'] = dpo_reg.mean().item()
+             loss = dpo_loss + dpo_reg
+         else:
+             raise ValueError(f'Unknown DPO mode: {self.dpo_mode}')
+
+         if self.timestep_weights is not None:
+             w_t = self.timestep_weights(t).squeeze()
+             loss = w_t * loss
+
+         loss = loss.mean(0)
+
+         print(f'Loss is {loss}, info is {info}')
195
+
196
+ return (loss, info) if return_info else loss
197
+
198
+ def compute_loss_single_pair(self, ligand, pocket, t):
199
+ pocket = Residues(**pocket)
200
+
201
+ # Center sample
202
+ ligand, pocket = center_data(ligand, pocket)
203
+ pocket_com = scatter_mean(pocket['x'], pocket['mask'], dim=0)
204
+
205
+ # Noise
206
+ z0_x = self.module_x.sample_z0(pocket_com, ligand['mask'])
207
+ z0_h = self.module_h.sample_z0(ligand['mask'])
208
+ z0_e = self.module_e.sample_z0(ligand['bond_mask'])
209
+ zt_x = self.module_x.sample_zt(z0_x, ligand['x'], t, ligand['mask'])
210
+ zt_h = self.module_h.sample_zt(z0_h, ligand['one_hot'], t, ligand['mask'])
211
+ zt_e = self.module_e.sample_zt(z0_e, ligand['bond_one_hot'], t, ligand['bond_mask'])
212
+
213
+ # Predict denoising
214
+ sc_transform = self.get_sc_transform_fn(None, zt_x, t, None, ligand['mask'], pocket)
215
+
216
+ pred_ligand, _ = self.dynamics(
217
+ zt_x, zt_h, ligand['mask'], pocket, t,
218
+ bonds_ligand=(ligand['bonds'], zt_e),
219
+ sc_transform=sc_transform
220
+ )
221
+
222
+ # Reference model
223
+ with torch.no_grad():
224
+ ref_pred_ligand, _ = self.ref_dynamics(
225
+ zt_x, zt_h, ligand['mask'], pocket, t,
226
+ bonds_ligand=(ligand['bonds'], zt_e),
227
+ sc_transform=sc_transform
228
+ )
229
+
230
+ # Compute L2 loss
231
+ loss_x = self.module_x.compute_loss(pred_ligand['vel'], z0_x, ligand['x'], t, ligand['mask'], reduce=self.loss_reduce)
232
+ ref_loss_x = self.module_x.compute_loss(ref_pred_ligand['vel'], z0_x, ligand['x'], t, ligand['mask'], reduce=self.loss_reduce)
233
+
234
+ t_next = torch.clamp(t + self.train_step_size, max=1.0)
235
+
236
+ loss_h = self.module_h.compute_loss(pred_ligand['logits_h'], zt_h, ligand['one_hot'], ligand['mask'], t, t_next, reduce=self.loss_reduce)
237
+ ref_loss_h = self.module_h.compute_loss(ref_pred_ligand['logits_h'], zt_h, ligand['one_hot'], ligand['mask'], t, t_next, reduce=self.loss_reduce)
238
+ loss_e = self.module_e.compute_loss(pred_ligand['logits_e'], zt_e, ligand['bond_one_hot'], ligand['bond_mask'], t, t_next, reduce=self.loss_reduce)
239
+ ref_loss_e = self.module_e.compute_loss(ref_pred_ligand['logits_e'], zt_e, ligand['bond_one_hot'], ligand['bond_mask'], t, t_next, reduce=self.loss_reduce)
240
+
241
+ return {
242
+ 'theta': {
243
+ 'x': loss_x,
244
+ 'h': loss_h,
245
+ 'e': loss_e,
246
+ },
247
+ 'ref': {
248
+ 'x': ref_loss_x,
249
+ 'h': ref_loss_h,
250
+ 'e': ref_loss_e,
251
+ }
252
+ }
src/model/dynamics.py ADDED
@@ -0,0 +1,791 @@
1
+ from collections.abc import Iterable
2
+ from abc import abstractmethod
3
+ import random
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ import numpy as np
8
+
9
+ from src.constants import INT_TYPE
10
+ from src.model.gvp import GVPModel, GVP, LayerNorm
11
+ from src.model.gvp_transformer import GVPTransformerModel
12
+ from src.constants import FLOAT_TYPE
13
+
14
+ from pdb import set_trace
15
+
16
+
17
+ def binomial_coefficient(n, k):
18
+ # source: https://discuss.pytorch.org/t/n-choose-k-function/121974
19
+ return ((n + 1).lgamma() - (k + 1).lgamma() - ((n - k) + 1).lgamma()).exp()
20
+
21
+
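
A quick sanity check of the lgamma-based n-choose-k above (floating point, so expect approximate values; assumes the `torch` import from this module):

n = torch.tensor([5.0, 6.0])
k = torch.tensor([2.0, 3.0])
print(binomial_coefficient(n, k))  # ~tensor([10., 20.])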
22
+ def cycle_counts(adj):
23
+ assert (adj.diag() == 0).all()
24
+ assert (adj == adj.T).all()
25
+
26
+ A = adj.float()
27
+ d = A.sum(dim=-1)
28
+
29
+ # Compute powers
30
+ A2 = A @ A
31
+ A3 = A2 @ A
32
+ A4 = A3 @ A
33
+ A5 = A4 @ A
34
+
35
+ x3 = A3.diag() / 2
36
+ x4 = (A4.diag() - d * (d - 1) - A @ d) / 2
37
+
38
+ """ New (different from DiGress)
39
+ case where correction is relevant:
40
+ 2 o
41
+ |
42
+ 1,3 o--o 4
43
+ | /
44
+ 0,5 o
45
+ """
46
+ # Triangle count matrix (indicates for each node i how many triangles it shares with node j)
47
+ T = adj * A2
48
+ x5 = (A5.diag() - 2 * T @ d - 4 * d * x3 - 2 * A @ x3 + 10 * x3) / 2
49
+
50
+ # # TODO
51
+ # A6 = A5 @ A
52
+ #
53
+ # # 4-cycle count matrix (indicates in how many shared 4-cycles i and j are 2 hops apart)
54
+ # Q2 = binomial_coefficient(n=A2 - d.diag(), k=torch.tensor(2))
55
+ #
56
+ # # 4-cycle count matrix (indicates in how many shared 4-cycles i and j are 1 (and 3) hop(s) apart)
57
+ # Q1 = A * (A3 - (d.view(-1, 1) + d.view(1, -1)) + 1) # "+1" because link between i and j is subtracted twice
58
+ #
59
+ # x6 = ...
60
+ # return torch.stack([x3, x4, x5, x6], dim=-1)
61
+
62
+ return torch.stack([x3, x4, x5], dim=-1)
63
+
64
+
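
Worked example for `cycle_counts` on a 4-cycle (square) graph: each node sits in zero triangles, exactly one 4-cycle, and zero 5-cycles:

adj = torch.tensor([[0, 1, 0, 1],
                    [1, 0, 1, 0],
                    [0, 1, 0, 1],
                    [1, 0, 1, 0]])
print(cycle_counts(adj))  # per-node [x3, x4, x5] -> [0., 1., 0.] for all four nodes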
65
+ # TODO: also consider directional aggregation as in:
66
+ # Beaini, Dominique, et al. "Directional graph networks."
67
+ # International Conference on Machine Learning. PMLR, 2021.
68
+ def eigenfeatures(A, batch_mask, k=5):
69
+ # TODO, see:
70
+ # - https://github.com/cvignac/DiGress/blob/main/src/diffusion/extra_features.py
71
+ # - https://arxiv.org/pdf/2209.14734.pdf (Appendix B.2)
72
+
73
+ # split adjacency matrix
74
+ batch = []
75
+ for i in torch.unique(batch_mask, sorted=True): # TODO: optimize (try to avoid loop)
76
+ batch_inds = torch.where(batch_mask == i)[0]
77
+ batch.append(A[torch.meshgrid(batch_inds, batch_inds, indexing='ij')])
78
+
79
+ eigenfeats = [get_nontrivial_eigenvectors(adj)[:, :k] for adj in batch]
80
+ # if there are less than k non-trivial eigenvectors
81
+ eigenfeats = [torch.cat([
82
+ x, torch.zeros(x.size(0), max(k - x.size(1), 0), device=x.device)], dim=-1)
83
+ for x in eigenfeats]
84
+ return torch.cat(eigenfeats, dim=0)
85
+
86
+
87
+ def get_nontrivial_eigenvectors(A, normalize_l=True, thresh=1e-5,
88
+ norm_eps=1e-12):
89
+ """
90
+ Compute eigenvectors of the graph Laplacian corresponding to non-zero
91
+ eigenvalues.
92
+ """
93
+ assert (A == A.T).all(), "undirected graph"
94
+
95
+ # Compute laplacian
96
+ d = A.sum(-1)
97
+ D = d.diag()
98
+ L = D - A
99
+
100
+ if normalize_l:
101
+ D_inv_sqrt = (1 / (d.sqrt() + norm_eps)).diag()
102
+ L = D_inv_sqrt @ L @ D_inv_sqrt
103
+
104
+ # Eigendecomposition
105
+ # eigenvalues are sorted in ascending order
106
+ # eigvecs matrix contains eigenvectors as its columns
107
+ eigvals, eigvecs = torch.linalg.eigh(L)
108
+
109
+ # index of first non-trivial eigenvector
110
+ try:
111
+ idx = torch.nonzero(eigvals > thresh)[0].item()
112
+ except IndexError:
113
+ # recover if no non-trivial eigenvectors are found
114
+ idx = eigvecs.size(1)
115
+
116
+ return eigvecs[:, idx:]
117
+
118
+
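
Minimal usage sketch for the two spectral helpers above, on a 3-node path graph (the constant eigenvector is dropped, and `eigenfeatures` zero-pads to k columns):

A = torch.tensor([[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]])
print(get_nontrivial_eigenvectors(A).shape)              # (3, 2)
print(eigenfeatures(A, torch.tensor([0, 0, 0])).shape)   # (3, 5) after padding to k=5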
119
+ class DynamicsBase(nn.Module):
120
+ """
121
+ Implements self-conditioning logic and basic functions
122
+ """
123
+ def __init__(
124
+ self,
125
+ predict_angles=False,
126
+ predict_frames=False,
127
+ add_cycle_counts=False,
128
+ add_spectral_feat=False,
129
+ self_conditioning=False,
130
+ augment_residue_sc=False,
131
+ augment_ligand_sc=False
132
+ ):
133
+ super().__init__()
134
+
135
+ if not hasattr(self, 'predict_angles'):
136
+ self.predict_angles = predict_angles
137
+
138
+ if not hasattr(self, 'predict_frames'):
139
+ self.predict_frames = predict_frames
140
+
141
+ if not hasattr(self, 'add_cycle_counts'):
142
+ self.add_cycle_counts = add_cycle_counts
143
+
144
+ if not hasattr(self, 'add_spectral_feat'):
145
+ self.add_spectral_feat = add_spectral_feat
146
+
147
+ if not hasattr(self, 'self_conditioning'):
148
+ self.self_conditioning = self_conditioning
149
+
150
+ if not hasattr(self, 'augment_residue_sc'):
151
+ self.augment_residue_sc = augment_residue_sc
152
+
153
+ if not hasattr(self, 'augment_ligand_sc'):
154
+ self.augment_ligand_sc = augment_ligand_sc
155
+
156
+ if self.self_conditioning:
157
+ self.prev_ligand = None
158
+ self.prev_residues = None
159
+
160
+ @abstractmethod
161
+ def _forward(self, x_atoms, h_atoms, mask_atoms, pocket, t, bonds_ligand=None,
162
+ h_atoms_sc=None, e_atoms_sc=None, h_residues_sc=None):
163
+ """
164
+ Implement forward pass.
165
+ Returns:
166
+ - vel
167
+ - h_final_atoms
168
+ - edge_final_atoms
169
+ - residue_angles
170
+ - residue_trans
171
+ - residue_rot
172
+ """
173
+ pass
174
+
175
+ def make_sc_input(self, pred_ligand, pred_residues, sc_transform):
176
+
177
+ if self.predict_confidence:
178
+ h_atoms_sc = (torch.cat([pred_ligand['logits_h'], pred_ligand['uncertainty_vel'].unsqueeze(1)], dim=-1),
179
+ pred_ligand['vel'].unsqueeze(1))
180
+ else:
181
+ h_atoms_sc = (pred_ligand['logits_h'], pred_ligand['vel'].unsqueeze(1))
182
+ e_atoms_sc = pred_ligand['logits_e']
183
+
184
+ if self.predict_frames:
185
+ h_residues_sc = (torch.cat([pred_residues['chi'], pred_residues['rot']], dim=-1),
186
+ pred_residues['trans'].unsqueeze(1))
187
+ elif self.predict_angles:
188
+ h_residues_sc = pred_residues['chi']
189
+ else:
190
+ h_residues_sc = None
191
+
192
+ if self.augment_residue_sc and h_residues_sc is not None:
193
+ if self.predict_frames:
194
+ h_residues_sc = (h_residues_sc[0], torch.cat(
195
+ [h_residues_sc[1], sc_transform['residues'](pred_residues['chi'], pred_residues['trans'].squeeze(1), pred_residues['rot'])], dim=1))
196
+
197
+ else:
198
+ h_residues_sc = (h_residues_sc, sc_transform['residues'](pred_residues['chi']))
199
+
200
+ if self.augment_ligand_sc:
201
+ h_atoms_sc = (h_atoms_sc[0], torch.cat(
202
+ [h_atoms_sc[1], sc_transform['atoms'](pred_ligand['vel'].unsqueeze(1))], dim=1))
203
+
204
+ return h_atoms_sc, e_atoms_sc, h_residues_sc
205
+
206
+ def forward(self, x_atoms, h_atoms, mask_atoms, pocket, t, bonds_ligand=None, sc_transform=None):
207
+ """
208
+ Implements self-conditioning as in https://arxiv.org/abs/2208.04202
209
+ """
210
+
211
+ h_atoms_sc, e_atoms_sc = None, None
212
+ h_residues_sc = None
213
+
214
+ if self.self_conditioning:
215
+
216
+ # Sampling: use previous prediction in all but the first time step
217
+ if not self.training and t.min() > 0.0:
218
+ assert t.min() == t.max(), "currently only supports sampling at same time steps"
219
+ assert self.prev_ligand is not None
220
+ assert self.prev_residues is not None or not self.predict_frames
221
+
222
+ else:
223
+ # Create zero tensors
224
+ zeros_ligand = {'logits_h': torch.zeros_like(h_atoms),
225
+ 'vel': torch.zeros_like(x_atoms),
226
+ 'logits_e': torch.zeros_like(bonds_ligand[1])}
227
+ if self.predict_confidence:
228
+ zeros_ligand['uncertainty_vel'] = torch.zeros(
229
+ len(x_atoms), dtype=x_atoms.dtype, device=x_atoms.device)
230
+
231
+ zeros_residues = {}
232
+ if self.predict_angles:
233
+ zeros_residues['chi'] = torch.zeros((pocket['one_hot'].size(0), 5), device=pocket['one_hot'].device)
234
+ if self.predict_frames:
235
+ zeros_residues['trans'] = torch.zeros((pocket['one_hot'].size(0), 3), device=pocket['one_hot'].device)
236
+ zeros_residues['rot'] = torch.zeros((pocket['one_hot'].size(0), 3), device=pocket['one_hot'].device)
237
+
238
+ # Training: use 50% zeros and 50% predictions with detached gradients
239
+ if self.training and random.random() > 0.5:
240
+ with torch.no_grad():
241
+ h_atoms_sc, e_atoms_sc, h_residues_sc = self.make_sc_input(
242
+ zeros_ligand, zeros_residues, sc_transform)
243
+
244
+ self.prev_ligand, self.prev_residues = self._forward(
245
+ x_atoms, h_atoms, mask_atoms, pocket, t, bonds_ligand,
246
+ h_atoms_sc, e_atoms_sc, h_residues_sc)
247
+
248
+ # use zeros for first sampling step and 50% of training
249
+ else:
250
+ self.prev_ligand = zeros_ligand
251
+ self.prev_residues = zeros_residues
252
+
253
+ h_atoms_sc, e_atoms_sc, h_residues_sc = self.make_sc_input(
254
+ self.prev_ligand, self.prev_residues, sc_transform)
255
+
256
+ pred_ligand, pred_residues = self._forward(
257
+ x_atoms, h_atoms, mask_atoms, pocket, t, bonds_ligand,
258
+ h_atoms_sc, e_atoms_sc, h_residues_sc
259
+ )
260
+
261
+ if self.self_conditioning and not self.training:
262
+ self.prev_ligand = pred_ligand.copy()
263
+ self.prev_residues = pred_residues.copy()
264
+
265
+ return pred_ligand, pred_residues
266
+
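
Stripped of the ligand/pocket plumbing, the 50/50 training recipe above (following Chen et al., 2022) is simply the following sketch; `model`, its `cond` keyword, and `zeros` are placeholders, not this class's API:

import random
import torch

def self_conditioned_training_forward(model, x, t, zeros):
    if random.random() > 0.5:
        # First pass (no gradients) produces the conditioning signal.
        with torch.no_grad():
            cond = model(x, t, cond=zeros)
    else:
        cond = zeros
    # Second, gradient-carrying pass consumes the previous prediction.
    return model(x, t, cond=cond)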
267
+ def compute_extra_features(self, batch_mask, edge_indices, edge_types):
268
+
269
+ feat = torch.zeros(len(batch_mask), 0, device=batch_mask.device)
270
+
271
+ if not (self.add_cycle_counts or self.add_spectral_feat):
272
+ return feat
273
+
274
+ adj = batch_mask[:, None] == batch_mask[None, :]
275
+
276
+ E = torch.zeros_like(adj, dtype=INT_TYPE)
277
+ E[edge_indices[0], edge_indices[1]] = edge_types
278
+
279
+ A = (E > 0).float()
280
+
281
+ if self.add_cycle_counts:
282
+ cycle_features = cycle_counts(A)
283
+ cycle_features[cycle_features > 10] = 10 # avoid large values
284
+
285
+ feat = torch.cat([feat, cycle_features], dim=-1)
286
+
287
+ if self.add_spectral_feat:
288
+ feat = torch.cat([feat, eigenfeatures(A, batch_mask)], dim=-1)
289
+
290
+ return feat
291
+
292
+
293
+ class Dynamics(DynamicsBase):
294
+ def __init__(self, atom_nf, residue_nf, joint_nf, bond_dict, pocket_bond_dict,
295
+ edge_nf, hidden_nf, act_fn=torch.nn.SiLU(), condition_time=True,
296
+ model='egnn', model_params=None,
297
+ edge_cutoff_ligand=None, edge_cutoff_pocket=None,
298
+ edge_cutoff_interaction=None,
299
+ predict_angles=False, predict_frames=False,
300
+ add_cycle_counts=False, add_spectral_feat=False,
301
+ add_nma_feat=False, self_conditioning=False,
302
+ augment_residue_sc=False, augment_ligand_sc=False,
303
+ add_chi_as_feature=False, angle_act_fn=False):
304
+ super().__init__()
305
+ self.model = model
306
+ self.edge_cutoff_l = edge_cutoff_ligand
307
+ self.edge_cutoff_p = edge_cutoff_pocket
308
+ self.edge_cutoff_i = edge_cutoff_interaction
309
+ self.hidden_nf = hidden_nf
310
+ self.predict_angles = predict_angles
311
+ self.predict_frames = predict_frames
312
+ self.bond_dict = bond_dict
313
+ self.pocket_bond_dict = pocket_bond_dict
314
+ self.bond_nf = len(bond_dict)
315
+ self.pocket_bond_nf = len(pocket_bond_dict)
316
+ self.edge_nf = edge_nf
317
+ self.add_cycle_counts = add_cycle_counts
318
+ self.add_spectral_feat = add_spectral_feat
319
+ self.add_nma_feat = add_nma_feat
320
+ self.self_conditioning = self_conditioning
321
+ self.augment_residue_sc = augment_residue_sc
322
+ self.augment_ligand_sc = augment_ligand_sc
323
+ self.add_chi_as_feature = add_chi_as_feature
324
+ self.predict_confidence = False
325
+
326
+ if self.self_conditioning:
327
+ self.prev_vel = None
328
+ self.prev_h = None
329
+ self.prev_e = None
330
+ self.prev_a = None
331
+ self.prev_ca = None
332
+ self.prev_rot = None
333
+
334
+ lig_nf = atom_nf
335
+ if self.add_cycle_counts:
336
+ lig_nf = lig_nf + 3
337
+ if self.add_spectral_feat:
338
+ lig_nf = lig_nf + 5
339
+
340
+
341
+ if not isinstance(joint_nf, Iterable):
342
+ # joint_nf contains only scalars
343
+ joint_nf = (joint_nf, 0)
344
+
345
+
346
+ if isinstance(residue_nf, Iterable):
347
+ _atom_in_nf = (lig_nf, 0)
348
+ _residue_atom_dim = residue_nf[1]
349
+
350
+ if self.add_nma_feat:
351
+ residue_nf = (residue_nf[0], residue_nf[1] + 5)
352
+
353
+ if self.self_conditioning:
354
+ _atom_in_nf = (_atom_in_nf[0] + atom_nf, 1)
355
+
356
+ if self.augment_ligand_sc:
357
+ _atom_in_nf = (_atom_in_nf[0], _atom_in_nf[1] + 1)
358
+
359
+ if self.predict_angles:
360
+ residue_nf = (residue_nf[0] + 5, residue_nf[1])
361
+
362
+ if self.predict_frames:
363
+ residue_nf = (residue_nf[0], residue_nf[1] + 2)
364
+
365
+ if self.augment_residue_sc:
366
+ assert self.predict_angles
367
+ residue_nf = (residue_nf[0], residue_nf[1] + _residue_atom_dim)
368
+
369
+ if self.add_chi_as_feature:
370
+ residue_nf = (residue_nf[0] + 5, residue_nf[1])
371
+
372
+ self.atom_encoder = nn.Sequential(
373
+ GVP(_atom_in_nf, joint_nf, activations=(act_fn, torch.sigmoid)),
374
+ LayerNorm(joint_nf, learnable_vector_weight=True),
375
+ GVP(joint_nf, joint_nf, activations=(None, None)),
376
+ )
377
+
378
+ self.residue_encoder = nn.Sequential(
379
+ GVP(residue_nf, joint_nf, activations=(act_fn, torch.sigmoid)),
380
+ LayerNorm(joint_nf, learnable_vector_weight=True),
381
+ GVP(joint_nf, joint_nf, activations=(None, None)),
382
+ )
383
+
384
+ else:
385
+ # No vector-valued input features
386
+ assert joint_nf[1] == 0
387
+
388
+ # self-conditioning not yet supported
389
+ assert not self.self_conditioning
390
+
391
+ # Normal mode features are vectors
392
+ assert not self.add_nma_feat
393
+
394
+ if self.add_chi_as_feature:
395
+ residue_nf += 5
396
+
397
+ self.atom_encoder = nn.Sequential(
398
+ nn.Linear(lig_nf, 2 * atom_nf),
399
+ act_fn,
400
+ nn.Linear(2 * atom_nf, joint_nf[0])
401
+ )
402
+
403
+ self.residue_encoder = nn.Sequential(
404
+ nn.Linear(residue_nf, 2 * residue_nf),
405
+ act_fn,
406
+ nn.Linear(2 * residue_nf, joint_nf[0])
407
+ )
408
+
409
+ self.atom_decoder = nn.Sequential(
410
+ nn.Linear(joint_nf[0], 2 * atom_nf),
411
+ act_fn,
412
+ nn.Linear(2 * atom_nf, atom_nf)
413
+ )
414
+
415
+ self.edge_decoder = nn.Sequential(
416
+ nn.Linear(hidden_nf, hidden_nf),
417
+ act_fn,
418
+ nn.Linear(hidden_nf, self.bond_nf)
419
+ )
420
+
421
+ _atom_bond_nf = 2 * self.bond_nf if self.self_conditioning else self.bond_nf
422
+ self.ligand_bond_encoder = nn.Sequential(
423
+ nn.Linear(_atom_bond_nf, hidden_nf),
424
+ act_fn,
425
+ nn.Linear(hidden_nf, self.edge_nf)
426
+ )
427
+
428
+ self.pocket_bond_encoder = nn.Sequential(
429
+ nn.Linear(self.pocket_bond_nf, hidden_nf),
430
+ act_fn,
431
+ nn.Linear(hidden_nf, self.edge_nf)
432
+ )
433
+
434
+ out_nf = (joint_nf[0], 1)
435
+ res_out_nf = (0, 0)
436
+ if self.predict_angles:
437
+ res_out_nf = (res_out_nf[0] + 5, res_out_nf[1])
438
+ if self.predict_frames:
439
+ res_out_nf = (res_out_nf[0], res_out_nf[1] + 2)
440
+ self.residue_decoder = nn.Sequential(
441
+ GVP(out_nf, out_nf, activations=(act_fn, torch.sigmoid)),
442
+ LayerNorm(out_nf, learnable_vector_weight=True),
443
+ GVP(out_nf, res_out_nf, activations=(None, None)),
444
+ ) if res_out_nf != (0, 0) else None
445
+
446
+ if not angle_act_fn:  # covers both None and the default value False
447
+ self.angle_act_fn = None
448
+ elif angle_act_fn == 'tanh':
449
+ self.angle_act_fn = lambda x: np.pi * F.tanh(x)
450
+ else:
451
+ raise NotImplementedError(f"Angle activation {angle_act_fn} not available")
452
+
453
+ # self.ligand_nobond_emb = nn.Parameter(torch.zeros(self.edge_nf))
454
+ # self.pocket_nobond_emb = nn.Parameter(torch.zeros(self.edge_nf))
455
+ self.cross_emb = nn.Parameter(torch.zeros(self.edge_nf),
456
+ requires_grad=True)
457
+
458
+ if condition_time:
459
+ dynamics_node_nf = (joint_nf[0] + 1, joint_nf[1])
460
+ else:
461
+ print('Warning: dynamics model is NOT conditioned on time.')
462
+ dynamics_node_nf = (joint_nf[0], joint_nf[1])
463
+
464
+ if model == 'egnn':
465
+ raise NotImplementedError
466
+ # self.net = EGNN(
467
+ # in_node_nf=dynamics_node_nf[0], in_edge_nf=self.edge_nf,
468
+ # hidden_nf=hidden_nf, out_node_nf=joint_nf[0],
469
+ # device=model_params.device, act_fn=act_fn,
470
+ # n_layers=model_params.n_layers,
471
+ # attention=model_params.attention,
472
+ # tanh=model_params.tanh,
473
+ # norm_constant=model_params.norm_constant,
474
+ # inv_sublayers=model_params.inv_sublayers,
475
+ # sin_embedding=model_params.sin_embedding,
476
+ # normalization_factor=model_params.normalization_factor,
477
+ # aggregation_method=model_params.aggregation_method,
478
+ # reflection_equiv=model_params.reflection_equivariant,
479
+ # update_edge_attr=True
480
+ # )
481
+ # self.node_nf = dynamics_node_nf[0]
482
+
483
+ elif model == 'gvp':
484
+ self.net = GVPModel(
485
+ node_in_dim=dynamics_node_nf, node_h_dim=model_params.node_h_dim,
486
+ node_out_nf=joint_nf[0], edge_in_nf=self.edge_nf,
487
+ edge_h_dim=model_params.edge_h_dim, edge_out_nf=hidden_nf,
488
+ num_layers=model_params.n_layers,
489
+ drop_rate=model_params.dropout,
490
+ vector_gate=model_params.vector_gate,
491
+ reflection_equiv=model_params.reflection_equivariant,
492
+ d_max=model_params.d_max,
493
+ num_rbf=model_params.num_rbf,
494
+ update_edge_attr=True
495
+ )
496
+
497
+ elif model == 'gvp_transformer':
498
+ self.net = GVPTransformerModel(
499
+ node_in_dim=dynamics_node_nf,
500
+ node_h_dim=model_params.node_h_dim,
501
+ node_out_nf=joint_nf[0],
502
+ edge_in_nf=self.edge_nf,
503
+ edge_h_dim=model_params.edge_h_dim,
504
+ edge_out_nf=hidden_nf,
505
+ num_layers=model_params.n_layers,
506
+ dk=model_params.dk,
507
+ dv=model_params.dv,
508
+ de=model_params.de,
509
+ db=model_params.db,
510
+ dy=model_params.dy,
511
+ attn_heads=model_params.attn_heads,
512
+ n_feedforward=model_params.n_feedforward,
513
+ drop_rate=model_params.dropout,
514
+ reflection_equiv=model_params.reflection_equivariant,
515
+ d_max=model_params.d_max,
516
+ num_rbf=model_params.num_rbf,
517
+ vector_gate=model_params.vector_gate,
518
+ attention=model_params.attention,
519
+ )
520
+
521
+ elif model == 'gnn':
522
+ raise NotImplementedError
523
+ # n_dims = 3
524
+ # self.net = GNN(
525
+ # in_node_nf=dynamics_node_nf + n_dims, in_edge_nf=self.edge_emb_dim,
526
+ # hidden_nf=hidden_nf, out_node_nf=n_dims + dynamics_node_nf,
527
+ # device=model_params.device, act_fn=act_fn, n_layers=model_params.n_layers,
528
+ # attention=model_params.attention, normalization_factor=model_params.normalization_factor,
529
+ # aggregation_method=model_params.aggregation_method)
530
+
531
+ else:
532
+ raise NotImplementedError(f"{model} is not available")
533
+
534
+ # self.device = device
535
+ # self.n_dims = n_dims
536
+ self.condition_time = condition_time
537
+
538
+ def _forward(self, x_atoms, h_atoms, mask_atoms, pocket, t, bonds_ligand=None,
539
+ h_atoms_sc=None, e_atoms_sc=None, h_residues_sc=None):
540
+ """
541
+ :param x_atoms: ligand atom coordinates, (n_atoms, 3)
542
+ :param h_atoms: ligand atom features (one-hot types plus extras), (n_atoms, nf)
543
+ :param mask_atoms: batch assignment of each ligand atom, (n_atoms,)
544
+ :param pocket: must contain keys: 'x', 'one_hot', 'mask', 'bonds' and 'bond_one_hot'
545
+ :param t: time step(s), scalar or one value per graph in the batch
546
+ :param bonds_ligand: tuple - bond indices (2, n_bonds) & bond types (n_bonds, bond_nf)
547
+ :param h_atoms_sc: additional node feature for self-conditioning, (s, V)
548
+ :param e_atoms_sc: additional edge feature for self-conditioning, only scalar
549
+ :param h_residues_sc: additional node feature for self-conditioning, tensor or tuple
550
+ :return: tuple (pred_ligand, pred_residues) of output dictionaries
551
+ """
552
+ x_residues, h_residues, mask_residues = pocket['x'], pocket['one_hot'], pocket['mask']
553
+ if 'bonds' in pocket:
554
+ bonds_pocket = (pocket['bonds'], pocket['bond_one_hot'])
555
+ else:
556
+ bonds_pocket = None
557
+
558
+ if self.add_chi_as_feature:
559
+ h_residues = torch.cat([h_residues, pocket['chi'][:, :5]], dim=-1)
560
+
561
+ if 'v' in pocket:
562
+ v_residues = pocket['v']
563
+ if self.add_nma_feat:
564
+ v_residues = torch.cat([v_residues, pocket['nma_vec']], dim=1)
565
+ h_residues = (h_residues, v_residues)
566
+
567
+ if h_residues_sc is not None:
568
+ # if self.augment_residue_sc:
569
+ if isinstance(h_residues_sc, tuple):
570
+ h_residues = (torch.cat([h_residues[0], h_residues_sc[0]], dim=-1),
571
+ torch.cat([h_residues[1], h_residues_sc[1]], dim=1))
572
+ else:
573
+ h_residues = (torch.cat([h_residues[0], h_residues_sc], dim=-1),
574
+ h_residues[1])
575
+
576
+ # get graph edges and edge attributes
577
+ if bonds_ligand is not None:
578
+ # NOTE: 'bond' denotes one-directional edges and 'edge' means bi-directional
579
+ ligand_bond_indices = bonds_ligand[0]
580
+
581
+ # make sure messages are passed both ways
582
+ ligand_edge_indices = torch.cat(
583
+ [bonds_ligand[0], bonds_ligand[0].flip(dims=[0])], dim=1)
584
+ ligand_edge_types = torch.cat([bonds_ligand[1], bonds_ligand[1]], dim=0)
585
+ # edges_ligand = (ligand_edge_indices, ligand_edge_types)
586
+
587
+ # add auxiliary features to ligand nodes
588
+ extra_features = self.compute_extra_features(
589
+ mask_atoms, ligand_edge_indices, ligand_edge_types.argmax(-1))
590
+ h_atoms = torch.cat([h_atoms, extra_features], dim=-1)
591
+
592
+ if bonds_pocket is not None:
593
+ # make sure messages are passed both ways
594
+ pocket_edge_indices = torch.cat(
595
+ [bonds_pocket[0], bonds_pocket[0].flip(dims=[0])], dim=1)
596
+ pocket_edge_types = torch.cat([bonds_pocket[1], bonds_pocket[1]], dim=0)
597
+ # edges_pocket = (pocket_edge_indices, pocket_edge_types)
598
+
599
+ if h_atoms_sc is not None:
600
+ h_atoms = (torch.cat([h_atoms, h_atoms_sc[0]], dim=-1),
601
+ h_atoms_sc[1])
602
+
603
+ if e_atoms_sc is not None:
604
+ e_atoms_sc = torch.cat([e_atoms_sc, e_atoms_sc], dim=0)
605
+ ligand_edge_types = torch.cat([ligand_edge_types, e_atoms_sc], dim=-1)
606
+
607
+ # embed atom features and residue features in a shared space
608
+ h_atoms = self.atom_encoder(h_atoms)
609
+ e_ligand = self.ligand_bond_encoder(ligand_edge_types)
610
+
611
+ if len(h_residues) > 0:
612
+ h_residues = self.residue_encoder(h_residues)
613
+ e_pocket = self.pocket_bond_encoder(pocket_edge_types)
614
+ else:
615
+ e_pocket = pocket_edge_types
616
+ h_residues = (h_residues, h_residues)
617
+ pocket_edge_indices = torch.tensor([[], []], dtype=torch.long, device=h_residues[0].device)
618
+ pocket_edge_types = torch.tensor([[], []], dtype=torch.long, device=h_residues[0].device)
619
+
620
+ if isinstance(h_atoms, tuple):
621
+ h_atoms, v_atoms = h_atoms
622
+ h_residues, v_residues = h_residues
623
+ v = torch.cat((v_atoms, v_residues), dim=0)
624
+ else:
625
+ v = None
626
+
627
+ edges, edge_feat = self.get_edges(
628
+ mask_atoms, mask_residues, x_atoms, x_residues,
629
+ bond_inds_ligand=ligand_edge_indices, bond_inds_pocket=pocket_edge_indices,
630
+ bond_feat_ligand=e_ligand, bond_feat_pocket=e_pocket)
631
+
632
+ # combine the two node types
633
+ x = torch.cat((x_atoms, x_residues), dim=0)
634
+ h = torch.cat((h_atoms, h_residues), dim=0)
635
+ mask = torch.cat([mask_atoms, mask_residues])
636
+
637
+ if self.condition_time:
638
+ if np.prod(t.size()) == 1:
639
+ # t is the same for all elements in batch.
640
+ h_time = torch.empty_like(h[:, 0:1]).fill_(t.item())
641
+ else:
642
+ # t is different over the batch dimension.
643
+ h_time = t[mask]
644
+ h = torch.cat([h, h_time], dim=1)
645
+
646
+ assert torch.all(mask[edges[0]] == mask[edges[1]])
647
+
648
+ if self.model == 'egnn':
649
+ # Don't update pocket coordinates
650
+ update_coords_mask = torch.cat((torch.ones_like(mask_atoms),
651
+ torch.zeros_like(mask_residues))).unsqueeze(1)
652
+ h_final, vel, edge_final = self.net(
653
+ h, x, edges, batch_mask=mask, edge_attr=edge_feat,
654
+ update_coords_mask=update_coords_mask)
655
+ # vel = (x_final - x)
656
+
657
+ elif self.model == 'gvp' or self.model == 'gvp_transformer':
658
+ h_final, vel, edge_final = self.net(
659
+ h, x, edges, v=v, batch_mask=mask, edge_attr=edge_feat)
660
+
661
+ elif self.model == 'gnn':
662
+ xh = torch.cat([x, h], dim=1)
663
+ output = self.net(xh, edges, node_mask=None, edge_attr=edge_feat)
664
+ vel = output[:, :3]
665
+ h_final = output[:, 3:]
666
+
667
+ else:
668
+ raise NotImplementedError(f"Wrong model ({self.model})")
669
+
670
+ # if self.condition_time:
671
+ # # Slice off last dimension which represented time.
672
+ # h_final = h_final[:, :-1]
673
+
674
+ # decode atom and residue features
675
+ h_final_atoms = self.atom_decoder(h_final[:len(mask_atoms)])
676
+
677
+ if torch.any(torch.isnan(vel)) or torch.any(torch.isnan(h_final_atoms)):
678
+ if self.training:
679
+ vel[torch.isnan(vel)] = 0.0
680
+ h_final_atoms[torch.isnan(h_final_atoms)] = 0.0
681
+ else:
682
+ raise ValueError("NaN detected in network output")
683
+
684
+ # predict edge type
685
+ ligand_edge_mask = (edges[0] < len(mask_atoms)) & (edges[1] < len(mask_atoms))
686
+ edge_final = edge_final[ligand_edge_mask]
687
+ edges = edges[:, ligand_edge_mask]
688
+
689
+ # Symmetrize
690
+ edge_logits = torch.zeros(
691
+ (len(mask_atoms), len(mask_atoms), self.hidden_nf),
692
+ device=mask_atoms.device)
693
+ edge_logits[edges[0], edges[1]] = edge_final
694
+ edge_logits = (edge_logits + edge_logits.transpose(0, 1)) * 0.5
695
+ # edge_logits = edge_logits[lig_edge_indices[0], lig_edge_indices[1]]
696
+
697
+ # return upper triangular elements only (matching the input)
698
+ edge_logits = edge_logits[ligand_bond_indices[0], ligand_bond_indices[1]]
699
+ # assert (edge_logits == 0).sum() == 0
700
+
701
+ edge_final_atoms = self.edge_decoder(edge_logits)
702
+
703
+ # Predict torsion angles
704
+ residue_angles = None
705
+ residue_trans, residue_rot = None, None
706
+ if self.residue_decoder is not None:
707
+ h_residues = h_final[len(mask_atoms):]
708
+ vec_residues = vel[len(mask_atoms):].unsqueeze(1)
709
+ residue_angles = self.residue_decoder((h_residues, vec_residues))
710
+ if self.predict_frames:
711
+ residue_angles, residue_frames = residue_angles
712
+ residue_trans = residue_frames[:, 0, :].squeeze(1)
713
+ residue_rot = residue_frames[:, 1, :].squeeze(1)
714
+ if self.angle_act_fn is not None:
715
+ residue_angles = self.angle_act_fn(residue_angles)
716
+
717
+ # return vel[:len(mask_atoms)], h_final_atoms, edge_final_atoms, residue_angles, residue_trans, residue_rot
718
+ pred_ligand = {'vel': vel[:len(mask_atoms)], 'logits_h': h_final_atoms, 'logits_e': edge_final_atoms}
719
+ pred_residues = {'chi': residue_angles, 'trans': residue_trans, 'rot': residue_rot}
720
+ return pred_ligand, pred_residues
721
+
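
On the time conditioning in `_forward` above: a per-graph timestep is broadcast to nodes by indexing with the node-to-graph mask, for example:

import torch

t = torch.tensor([[0.2], [0.7]])       # one timestep per graph in the batch
mask = torch.tensor([0, 0, 0, 1, 1])   # node -> graph assignment
h_time = t[mask]                       # (5, 1), ready to concatenate onto h
print(h_time.squeeze(-1))              # tensor([0.2000, 0.2000, 0.2000, 0.7000, 0.7000])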
722
+ def get_edges(self, batch_mask_ligand, batch_mask_pocket, x_ligand,
723
+ x_pocket, bond_inds_ligand=None, bond_inds_pocket=None,
724
+ bond_feat_ligand=None, bond_feat_pocket=None, self_edges=False):
725
+
726
+ # Adjacency matrix
727
+ adj_ligand = batch_mask_ligand[:, None] == batch_mask_ligand[None, :]
728
+ adj_pocket = batch_mask_pocket[:, None] == batch_mask_pocket[None, :]
729
+ adj_cross = batch_mask_ligand[:, None] == batch_mask_pocket[None, :]
730
+
731
+ if self.edge_cutoff_l is not None:
732
+ adj_ligand = adj_ligand & (torch.cdist(x_ligand, x_ligand) <= self.edge_cutoff_l)
733
+
734
+ # Add missing bonds if they got removed
735
+ adj_ligand[bond_inds_ligand[0], bond_inds_ligand[1]] = True
736
+
737
+ if self.edge_cutoff_p is not None and len(x_pocket) > 0:
738
+ adj_pocket = adj_pocket & (torch.cdist(x_pocket, x_pocket) <= self.edge_cutoff_p)
739
+
740
+ # Add missing bonds if they got removed
741
+ adj_pocket[bond_inds_pocket[0], bond_inds_pocket[1]] = True
742
+
743
+ if self.edge_cutoff_i is not None and len(x_pocket) > 0:
744
+ adj_cross = adj_cross & (torch.cdist(x_ligand, x_pocket) <= self.edge_cutoff_i)
745
+
746
+ adj = torch.cat((torch.cat((adj_ligand, adj_cross), dim=1),
747
+ torch.cat((adj_cross.T, adj_pocket), dim=1)), dim=0)
748
+
749
+ if not self_edges:
750
+ adj = adj ^ torch.eye(*adj.size(), out=torch.empty_like(adj))
751
+
752
+ # # ensure that edge definition is consistent if bonds are provided (for loss computation)
753
+ # if bond_inds_ligand is not None:
754
+ # # remove ligand edges
755
+ # adj[:adj_ligand.size(0), :adj_ligand.size(1)] = False
756
+ # edges = torch.stack(torch.where(adj), dim=0)
757
+ # # add ligand edges back with original definition
758
+ # edges = torch.cat([bond_inds_ligand, edges], dim=-1)
759
+ # else:
760
+ # edges = torch.stack(torch.where(adj), dim=0)
761
+
762
+ # Feature matrix
763
+ ligand_nobond_onehot = F.one_hot(torch.tensor(
764
+ self.bond_dict['NOBOND'], device=bond_feat_ligand.device),
765
+ num_classes=self.ligand_bond_encoder[0].in_features)
766
+ ligand_nobond_emb = self.ligand_bond_encoder(
767
+ ligand_nobond_onehot.to(FLOAT_TYPE))
768
+ feat_ligand = ligand_nobond_emb.repeat(*adj_ligand.shape, 1)
769
+ feat_ligand[bond_inds_ligand[0], bond_inds_ligand[1]] = bond_feat_ligand
770
+
771
+ if len(adj_pocket) > 0:
772
+ pocket_nobond_onehot = F.one_hot(torch.tensor(
773
+ self.pocket_bond_dict['NOBOND'], device=bond_feat_pocket.device),
774
+ num_classes=self.pocket_bond_nf)
775
+ pocket_nobond_emb = self.pocket_bond_encoder(
776
+ pocket_nobond_onehot.to(FLOAT_TYPE))
777
+ feat_pocket = pocket_nobond_emb.repeat(*adj_pocket.shape, 1)
778
+ feat_pocket[bond_inds_pocket[0], bond_inds_pocket[1]] = bond_feat_pocket
779
+
780
+ feat_cross = self.cross_emb.repeat(*adj_cross.shape, 1)
781
+
782
+ feats = torch.cat((torch.cat((feat_ligand, feat_cross), dim=1),
783
+ torch.cat((feat_cross.transpose(0, 1), feat_pocket), dim=1)), dim=0)
784
+ else:
785
+ feats = feat_ligand
786
+
787
+ # Return results
788
+ edges = torch.stack(torch.where(adj), dim=0)
789
+ edge_feat = feats[edges[0], edges[1]]
790
+
791
+ return edges, edge_feat
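
The core of `get_edges` is a batched radius graph; a minimal standalone sketch of that step (names illustrative):

import torch

def radius_edges(x, batch_mask, cutoff):
    adj = batch_mask[:, None] == batch_mask[None, :]   # same-graph pairs only
    adj = adj & (torch.cdist(x, x) <= cutoff)          # within the distance cutoff
    adj.fill_diagonal_(False)                          # no self-edges
    return torch.stack(torch.where(adj), dim=0)        # (2, n_edges), both directions

x = torch.randn(6, 3)
batch_mask = torch.tensor([0, 0, 0, 1, 1, 1])
print(radius_edges(x, batch_mask, cutoff=5.0).shape)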
src/model/dynamics_hetero.py ADDED
@@ -0,0 +1,1008 @@
1
+ from collections.abc import Iterable
2
+ from collections import defaultdict
3
+ from functools import partial
4
+ import functools
5
+ import warnings
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ import numpy as np
10
+ from torch_scatter import scatter_mean
11
+ from torch_geometric.nn import MessagePassing
12
+ from torch_geometric.nn.module_dict import ModuleDict
13
+ from torch_geometric.utils.hetero import check_add_self_loops
14
+ try:
15
+ from torch_geometric.nn.conv.hgt_conv import group
16
+ except ImportError as e:
17
+ from torch_geometric.nn.conv.hetero_conv import group
18
+
19
+ from src.model.dynamics import DynamicsBase
20
+ from src.model import gvp
21
+ from src.model.gvp import GVP, _rbf, _normalize, tuple_index, tuple_sum, _split, tuple_cat, _merge
22
+
23
+
24
+ class MyModuleDict(nn.ModuleDict):
25
+ def __init__(self, modules):
26
+ # a mapping (dictionary) of (string: module) or an iterable of key-value pairs of type (string, module)
27
+ if isinstance(modules, dict):
28
+ super().__init__({str(k): v for k, v in modules.items()})
29
+ else:
30
+ raise NotImplementedError
31
+
32
+ def __getitem__(self, key):
33
+ return super().__getitem__(str(key))
34
+
35
+ def __setitem__(self, key, value):
36
+ super().__setitem__(str(key), value)
37
+
38
+ def __delitem__(self, key):
39
+ super().__delitem__(str(key))
40
+
41
+
42
+ class MyHeteroConv(nn.Module):
43
+ """
44
+ Implementation from PyG 2.2.0 with minor changes.
45
+ Override forward pass to control the final aggregation
46
+ Ref.: https://pytorch-geometric.readthedocs.io/en/2.2.0/_modules/torch_geometric/nn/conv/hetero_conv.html
47
+ """
48
+ def __init__(self, convs, aggr="sum"):
49
+ self.vo = {}
50
+ for k, module in convs.items():
51
+ dst = k[-1]
52
+ if dst not in self.vo:
53
+ self.vo[dst] = module.vo
54
+ else:
55
+ assert self.vo[dst] == module.vo
56
+
57
+ # from the original implementation in PyTorch Geometric
58
+ super().__init__()
59
+
60
+ for edge_type, module in convs.items():
61
+ check_add_self_loops(module, [edge_type])
62
+
63
+ src_node_types = set([key[0] for key in convs.keys()])
64
+ dst_node_types = set([key[-1] for key in convs.keys()])
65
+ if len(src_node_types - dst_node_types) > 0:
66
+ warnings.warn(
67
+ f"There exist node types ({src_node_types - dst_node_types}) "
68
+ f"whose representations do not get updated during message "
69
+ f"passing as they do not occur as destination type in any "
70
+ f"edge type. This may lead to unexpected behaviour.")
71
+
72
+ self.convs = ModuleDict({'__'.join(k): v for k, v in convs.items()})
73
+ self.aggr = aggr
74
+
75
+ def reset_parameters(self):
76
+ for conv in self.convs.values():
77
+ conv.reset_parameters()
78
+
79
+ def __repr__(self) -> str:
80
+ return f'{self.__class__.__name__}(num_relations={len(self.convs)})'
81
+
82
+ def forward(
83
+ self,
84
+ x_dict,
85
+ edge_index_dict,
86
+ *args_dict,
87
+ **kwargs_dict,
88
+ ):
89
+ r"""
90
+ Args:
91
+ x_dict (Dict[str, Tensor]): A dictionary holding node feature
92
+ information for each individual node type.
93
+ edge_index_dict (Dict[Tuple[str, str, str], Tensor]): A dictionary
94
+ holding graph connectivity information for each individual
95
+ edge type.
96
+ *args_dict (optional): Additional forward arguments of individual
97
+ :class:`torch_geometric.nn.conv.MessagePassing` layers.
98
+ **kwargs_dict (optional): Additional forward arguments of
99
+ individual :class:`torch_geometric.nn.conv.MessagePassing`
100
+ layers.
101
+ For example, if a specific GNN layer at edge type
102
+ :obj:`edge_type` expects edge attributes :obj:`edge_attr` as a
103
+ forward argument, then you can pass them to
104
+ :meth:`~torch_geometric.nn.conv.HeteroConv.forward` via
105
+ :obj:`edge_attr_dict = { edge_type: edge_attr }`.
106
+ """
107
+ out_dict = defaultdict(list)
108
+ out_dict_edge = {}
109
+ for edge_type, edge_index in edge_index_dict.items():
110
+ src, rel, dst = edge_type
111
+
112
+ str_edge_type = '__'.join(edge_type)
113
+ if str_edge_type not in self.convs:
114
+ continue
115
+
116
+ args = []
117
+ for value_dict in args_dict:
118
+ if edge_type in value_dict:
119
+ args.append(value_dict[edge_type])
120
+ elif src == dst and src in value_dict:
121
+ args.append(value_dict[src])
122
+ elif src in value_dict or dst in value_dict:
123
+ args.append(
124
+ (value_dict.get(src, None), value_dict.get(dst, None)))
125
+
126
+ kwargs = {}
127
+ for arg, value_dict in kwargs_dict.items():
128
+ arg = arg[:-5] # `{*}_dict`
129
+ if edge_type in value_dict:
130
+ kwargs[arg] = value_dict[edge_type]
131
+ elif src == dst and src in value_dict:
132
+ kwargs[arg] = value_dict[src]
133
+ elif src in value_dict or dst in value_dict:
134
+ kwargs[arg] = (value_dict.get(src, None),
135
+ value_dict.get(dst, None))
136
+
137
+ conv = self.convs[str_edge_type]
138
+
139
+ if src == dst:
140
+ out = conv(x_dict[src], edge_index, *args, **kwargs)
141
+ else:
142
+ out = conv((x_dict[src], x_dict[dst]), edge_index, *args,
143
+ **kwargs)
144
+
145
+ if isinstance(out, (tuple, list)):
146
+ out, out_edge = out
147
+ out_dict_edge[edge_type] = out_edge
148
+
149
+ out_dict[dst].append(out)
150
+
151
+ for key, value in out_dict.items():
152
+ out_dict[key] = group(value, self.aggr)
153
+ out_dict[key] = _split(out_dict[key], self.vo[key])
154
+
155
+ return (out_dict, out_dict_edge) if len(out_dict_edge) > 0 else out_dict
156
+
157
+
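
Per destination node type, `forward` sums the messages produced by all incoming edge types (`aggr='sum'`); schematically, PyG's `group` does the following on the merged tensors:

import torch

def group_sum(msgs):
    # one aggregated tensor per destination node type
    return torch.stack(msgs, dim=0).sum(0)

m1 = torch.randn(5, 8)  # e.g. output of ('ligand', '', 'pocket')
m2 = torch.randn(5, 8)  # e.g. output of ('pocket', '', 'pocket')
print(group_sum([m1, m2]).shape)  # (5, 8)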
158
+ class GVPHeteroConv(MessagePassing):
159
+ '''
160
+ Graph convolution / message passing with Geometric Vector Perceptrons.
161
+ Takes in a graph with node and edge embeddings,
162
+ and returns new node embeddings.
163
+
164
+ This does NOT do residual updates and pointwise feedforward layers
165
+ ---see `GVPConvLayer`.
166
+
167
+ :param in_dims: input node embedding dimensions (n_scalar, n_vector)
168
+ :param out_dims: output node embedding dimensions (n_scalar, n_vector)
169
+ :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
170
+ :param n_layers: number of GVPs in the message function
171
+ :param module_list: preconstructed message function, overrides n_layers
172
+ :param aggr: should be "add" if some incoming edges are masked, as in
173
+ a masked autoregressive decoder architecture, otherwise "mean"
174
+ :param activations: tuple of functions (scalar_act, vector_act) to use in GVPs
175
+ :param vector_gate: whether to use vector gating.
176
+ (vector_act will be used as sigma^+ in vector gating if `True`)
177
+ :param update_edge_attr: whether to compute an updated edge representation
178
+ '''
179
+
180
+ def __init__(self, in_dims, out_dims, edge_dims, in_dims_other=None,
181
+ n_layers=3, module_list=None, aggr="mean",
182
+ activations=(F.relu, torch.sigmoid), vector_gate=False,
183
+ update_edge_attr=False):
184
+ super(GVPHeteroConv, self).__init__(aggr=aggr)
185
+
186
+ if in_dims_other is None:
187
+ in_dims_other = in_dims
188
+
189
+ self.si, self.vi = in_dims
190
+ self.si_other, self.vi_other = in_dims_other
191
+ self.so, self.vo = out_dims
192
+ self.se, self.ve = edge_dims
193
+ self.update_edge_attr = update_edge_attr
194
+
195
+ GVP_ = functools.partial(GVP,
196
+ activations=activations,
197
+ vector_gate=vector_gate)
198
+
199
+ def get_modules(module_list, out_dims):
200
+ module_list = module_list or []
201
+ if not module_list:
202
+ if n_layers == 1:
203
+ module_list.append(
204
+ GVP_((self.si + self.si_other + self.se, self.vi + self.vi_other + self.ve),
205
+ (self.so, self.vo), activations=(None, None)))
206
+ else:
207
+ module_list.append(
208
+ GVP_((self.si + self.si_other + self.se, self.vi + self.vi_other + self.ve),
209
+ out_dims)
210
+ )
211
+ for i in range(n_layers - 2):
212
+ module_list.append(GVP_(out_dims, out_dims))
213
+ module_list.append(GVP_(out_dims, out_dims,
214
+ activations=(None, None)))
215
+ return nn.Sequential(*module_list)
216
+
217
+ self.message_func = get_modules(module_list, out_dims)
218
+ self.edge_func = get_modules(module_list, edge_dims) if self.update_edge_attr else None
219
+
220
+ def forward(self, x, edge_index, edge_attr):
221
+ '''
222
+ :param x: tuple (s, V) of `torch.Tensor`, or a pair of such tuples for bipartite (src, dst) inputs
223
+ :param edge_index: array of shape [2, n_edges]
224
+ :param edge_attr: tuple (s, V) of `torch.Tensor`
225
+ '''
226
+ elem_0, elem_1 = x
227
+ if isinstance(elem_0, (tuple, list)):
228
+ assert isinstance(elem_1, (tuple, list))
229
+ x_s = (elem_0[0], elem_1[0])
230
+ x_v = (elem_0[1].reshape(elem_0[1].shape[0], 3 * elem_0[1].shape[1]),
231
+ elem_1[1].reshape(elem_1[1].shape[0], 3 * elem_1[1].shape[1]))
232
+ else:
233
+ x_s, x_v = elem_0, elem_1
234
+ x_v = x_v.reshape(x_v.shape[0], 3 * x_v.shape[1])
235
+
236
+ message = self.propagate(edge_index, s=x_s, v=x_v, edge_attr=edge_attr)
237
+
238
+ if self.update_edge_attr:
239
+ if isinstance(x_s, (tuple, list)):
240
+ s_i, s_j = x_s[1][edge_index[1]], x_s[0][edge_index[0]]
241
+ else:
242
+ s_i, s_j = x_s[edge_index[1]], x_s[edge_index[0]]
243
+
244
+ if isinstance(x_v, (tuple, list)):
245
+ v_i, v_j = x_v[1][edge_index[1]], x_v[0][edge_index[0]]
246
+ else:
247
+ v_i, v_j = x_v[edge_index[1]], x_v[edge_index[0]]
248
+
249
+ edge_out = self.edge_attr(s_i, v_i, s_j, v_j, edge_attr)
250
+ # return _split(message, self.vo), edge_out
251
+ return message, edge_out
252
+ else:
253
+ # return _split(message, self.vo)
254
+ return message
255
+
256
+ def message(self, s_i, v_i, s_j, v_j, edge_attr):
257
+ v_j = v_j.view(v_j.shape[0], v_j.shape[1] // 3, 3)
258
+ v_i = v_i.view(v_i.shape[0], v_i.shape[1] // 3, 3)
259
+ message = tuple_cat((s_j, v_j), edge_attr, (s_i, v_i))
260
+ message = self.message_func(message)
261
+ return _merge(*message)
262
+
263
+ def edge_attr(self, s_i, v_i, s_j, v_j, edge_attr):
264
+ v_j = v_j.view(v_j.shape[0], v_j.shape[1] // 3, 3)
265
+ v_i = v_i.view(v_i.shape[0], v_i.shape[1] // 3, 3)
266
+ message = tuple_cat((s_j, v_j), edge_attr, (s_i, v_i))
267
+ return self.edge_func(message)
268
+
269
+
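
Note the flatten/restore trick in `forward`/`message` above: PyG's propagate() only routes 2-D node tensors, so vector channels of shape (n, v, 3) are flattened to (n, 3*v) before message passing and reshaped back inside the message function:

import torch

v = torch.randn(10, 4, 3)                        # 4 vector channels per node
flat = v.reshape(v.shape[0], 3 * v.shape[1])     # (10, 12), safe to propagate
restored = flat.view(flat.shape[0], flat.shape[1] // 3, 3)
assert torch.equal(v, restored)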
270
+ class GVPHeteroConvLayer(nn.Module):
271
+ """
272
+ Full graph convolution / message passing layer with
273
+ Geometric Vector Perceptrons. Residually updates node embeddings with
274
+ aggregated incoming messages, applies a pointwise feedforward
275
+ network to node embeddings, and returns updated node embeddings.
276
+
277
+ To only compute the aggregated messages, see `GVPConv`.
278
+
279
+ :param conv_dims: dictionary mapping each edge type to (src_dim, dst_dim, edge_dim) or (src_dim, dst_dim, edge_dim, dst_in_dim)
280
+ """
281
+ def __init__(self, conv_dims,
282
+ n_message=3, n_feedforward=2, drop_rate=.1,
283
+ activations=(F.relu, torch.sigmoid), vector_gate=False,
284
+ update_edge_attr=False, ln_vector_weight=False):
285
+
286
+ super(GVPHeteroConvLayer, self).__init__()
287
+ self.update_edge_attr = update_edge_attr
288
+
289
+ gvp_conv = partial(GVPHeteroConv,
290
+ n_layers=n_message,
291
+ aggr="sum",
292
+ activations=activations,
293
+ vector_gate=vector_gate,
294
+ update_edge_attr=update_edge_attr)
295
+
296
+ def get_feedforward(n_dims):
297
+ GVP_ = partial(GVP, activations=activations, vector_gate=vector_gate)
298
+
299
+ ff_func = []
300
+ if n_feedforward == 1:
301
+ ff_func.append(GVP_(n_dims, n_dims, activations=(None, None)))
302
+ else:
303
+ hid_dims = 4 * n_dims[0], 2 * n_dims[1]
304
+ ff_func.append(GVP_(n_dims, hid_dims))
305
+ for i in range(n_feedforward - 2):
306
+ ff_func.append(GVP_(hid_dims, hid_dims))
307
+ ff_func.append(GVP_(hid_dims, n_dims, activations=(None, None)))
308
+ return nn.Sequential(*ff_func)
309
+
310
+ # self.conv = HeteroConv({k: gvp_conv(*dims) for k, dims in conv_dims.items()}, aggr='sum')
311
+ self.conv = MyHeteroConv({k: gvp_conv(*dims) for k, dims in conv_dims.items()}, aggr='sum')
312
+
313
+ node_dims = {k[-1]: dims[1] for k, dims in conv_dims.items()}
314
+ self.norm0 = MyModuleDict({k: gvp.LayerNorm(dims, ln_vector_weight) for k, dims in node_dims.items()})
315
+ self.dropout0 = MyModuleDict({k: gvp.Dropout(drop_rate) for k, dims in node_dims.items()})
316
+ self.ff_func = MyModuleDict({k: get_feedforward(dims) for k, dims in node_dims.items()})
317
+ self.norm1 = MyModuleDict({k: gvp.LayerNorm(dims, ln_vector_weight) for k, dims in node_dims.items()})
318
+ self.dropout1 = MyModuleDict({k: gvp.Dropout(drop_rate) for k, dims in node_dims.items()})
319
+
320
+ if self.update_edge_attr:
321
+ self.edge_norm0 = MyModuleDict({k: gvp.LayerNorm(dims[2], ln_vector_weight) for k, dims in conv_dims.items()})
322
+ self.edge_dropout0 = MyModuleDict({k: gvp.Dropout(drop_rate) for k, dims in conv_dims.items()})
323
+ self.edge_ff = MyModuleDict({k: get_feedforward(dims[2]) for k, dims in conv_dims.items()})
324
+ self.edge_norm1 = MyModuleDict({k: gvp.LayerNorm(dims[2], ln_vector_weight) for k, dims in conv_dims.items()})
325
+ self.edge_dropout1 = MyModuleDict({k: gvp.Dropout(drop_rate) for k, dims in conv_dims.items()})
326
+
327
+ def forward(self, x_dict, edge_index_dict, edge_attr_dict, node_mask_dict=None):
328
+ '''
329
+ :param x: tuple (s, V) of `torch.Tensor`
330
+ :param edge_index: array of shape [2, n_edges]
331
+ :param edge_attr: tuple (s, V) of `torch.Tensor`
332
+ :param node_mask: array of type `bool` to index into the first
333
+ dim of node embeddings (s, V). If not `None`, only
334
+ these nodes will be updated.
335
+ '''
336
+
337
+ dh_dict = self.conv(x_dict, edge_index_dict, edge_attr_dict)
338
+
339
+ if self.update_edge_attr:
340
+ dh_dict, de_dict = dh_dict
341
+
342
+ for k, edge_attr in edge_attr_dict.items():
343
+ de = de_dict[k]
344
+
345
+ edge_attr = self.edge_norm0[k](tuple_sum(edge_attr, self.edge_dropout0[k](de)))
346
+ de = self.edge_ff[k](edge_attr)
347
+ edge_attr = self.edge_norm1[k](tuple_sum(edge_attr, self.edge_dropout1[k](de)))
348
+
349
+ edge_attr_dict[k] = edge_attr
350
+
351
+ for k, x in x_dict.items():
352
+ dh = dh_dict[k]
353
+ node_mask = None if node_mask_dict is None else node_mask_dict[k]
354
+
355
+ if node_mask is not None:
356
+ x_ = x
357
+ x, dh = tuple_index(x, node_mask), tuple_index(dh, node_mask)
358
+
359
+ x = self.norm0[k](tuple_sum(x, self.dropout0[k](dh)))
360
+
361
+ dh = self.ff_func[k](x)
362
+ x = self.norm1[k](tuple_sum(x, self.dropout1[k](dh)))
363
+
364
+ if node_mask is not None:
365
+ x_[0][node_mask], x_[1][node_mask] = x[0], x[1]
366
+ x = x_
367
+
368
+ x_dict[k] = x
369
+
370
+ return (x_dict, edge_attr_dict) if self.update_edge_attr else x_dict
371
+
372
+
373
+ class GVPModel(torch.nn.Module):
374
+ """
375
+ GVP-GNN model
376
+ inspired by: https://github.com/drorlab/gvp-pytorch/blob/main/gvp/models.py
377
+ and: https://github.com/drorlab/gvp-pytorch/blob/82af6b22eaf8311c15733117b0071408d24ed877/gvp/atom3d.py#L115
378
+ """
379
+ def __init__(self,
380
+ node_in_dim_ligand, node_in_dim_pocket,
381
+ edge_in_dim_ligand, edge_in_dim_pocket, edge_in_dim_interaction,
382
+ node_h_dim_ligand, node_h_dim_pocket,
383
+ edge_h_dim_ligand, edge_h_dim_pocket, edge_h_dim_interaction,
384
+ node_out_dim_ligand=None, node_out_dim_pocket=None,
385
+ edge_out_dim_ligand=None, edge_out_dim_pocket=None, edge_out_dim_interaction=None,
386
+ num_layers=3, drop_rate=0.1, vector_gate=False, update_edge_attr=False):
387
+
388
+ super(GVPModel, self).__init__()
389
+
390
+ self.update_edge_attr = update_edge_attr
391
+
392
+ self.node_in = nn.ModuleDict({
393
+ 'ligand': GVP(node_in_dim_ligand, node_h_dim_ligand, activations=(None, None), vector_gate=vector_gate),
394
+ 'pocket': GVP(node_in_dim_pocket, node_h_dim_pocket, activations=(None, None), vector_gate=vector_gate),
395
+ })
396
+ # self.edge_in = MyModuleDict({
397
+ # ('ligand', 'ligand'): GVP(edge_in_dim_ligand, edge_h_dim_ligand, activations=(None, None), vector_gate=vector_gate),
398
+ # ('pocket', 'pocket'): GVP(edge_in_dim_pocket, edge_h_dim_pocket, activations=(None, None), vector_gate=vector_gate),
399
+ # ('ligand', 'pocket'): GVP(edge_in_dim_interaction, edge_h_dim_interaction, activations=(None, None), vector_gate=vector_gate),
400
+ # ('pocket', 'ligand'): GVP(edge_in_dim_interaction, edge_h_dim_interaction, activations=(None, None), vector_gate=vector_gate),
401
+ # })
402
+ self.edge_in = MyModuleDict({
403
+ ('ligand', '', 'ligand'): GVP(edge_in_dim_ligand, edge_h_dim_ligand, activations=(None, None), vector_gate=vector_gate),
404
+ ('pocket', '', 'pocket'): GVP(edge_in_dim_pocket, edge_h_dim_pocket, activations=(None, None), vector_gate=vector_gate),
405
+ ('ligand', '', 'pocket'): GVP(edge_in_dim_interaction, edge_h_dim_interaction, activations=(None, None), vector_gate=vector_gate),
406
+ ('pocket', '', 'ligand'): GVP(edge_in_dim_interaction, edge_h_dim_interaction, activations=(None, None), vector_gate=vector_gate),
407
+ })
408
+
409
+ # conv_dims = {
410
+ # ('ligand', 'ligand'): (node_h_dim_ligand, node_h_dim_ligand, edge_h_dim_ligand),
411
+ # ('pocket', 'pocket'): (node_h_dim_pocket, node_h_dim_pocket, edge_h_dim_pocket),
412
+ # ('ligand', 'pocket'): (node_h_dim_ligand, node_h_dim_pocket, edge_h_dim_interaction),
413
+ # ('pocket', 'ligand'): (node_h_dim_pocket, node_h_dim_ligand, edge_h_dim_interaction),
414
+ # }
415
+ conv_dims = {
416
+ ('ligand', '', 'ligand'): (node_h_dim_ligand, node_h_dim_ligand, edge_h_dim_ligand),
417
+ ('pocket', '', 'pocket'): (node_h_dim_pocket, node_h_dim_pocket, edge_h_dim_pocket),
418
+ ('ligand', '', 'pocket'): (node_h_dim_ligand, node_h_dim_pocket, edge_h_dim_interaction, node_h_dim_pocket),
419
+ ('pocket', '', 'ligand'): (node_h_dim_pocket, node_h_dim_ligand, edge_h_dim_interaction, node_h_dim_ligand),
420
+ }
421
+
422
+ self.layers = nn.ModuleList(
423
+ GVPHeteroConvLayer(conv_dims,
424
+ drop_rate=drop_rate,
425
+ update_edge_attr=self.update_edge_attr,
426
+ activations=(F.relu, None),
427
+ vector_gate=vector_gate,
428
+ ln_vector_weight=True)
429
+ for _ in range(num_layers))
430
+
431
+ self.node_out = nn.ModuleDict({
432
+ 'ligand': GVP(node_h_dim_ligand, node_out_dim_ligand, activations=(None, None), vector_gate=vector_gate),
433
+ 'pocket': GVP(node_h_dim_pocket, node_out_dim_pocket, activations=(None, None), vector_gate=vector_gate) if node_out_dim_pocket is not None else None,
434
+ })
435
+ # self.edge_out = MyModuleDict({
436
+ # ('ligand', 'ligand'): GVP(edge_h_dim_ligand, edge_out_dim_ligand, activations=(None, None), vector_gate=vector_gate) if edge_out_dim_ligand is not None else None,
437
+ # ('pocket', 'pocket'): GVP(edge_h_dim_pocket, edge_out_dim_pocket, activations=(None, None), vector_gate=vector_gate) if edge_out_dim_pocket is not None else None,
438
+ # ('ligand', 'pocket'): GVP(edge_h_dim_interaction, edge_out_dim_interaction, activations=(None, None), vector_gate=vector_gate) if edge_out_dim_interaction is not None else None,
439
+ # ('pocket', 'ligand'): GVP(edge_h_dim_interaction, edge_out_dim_interaction, activations=(None, None), vector_gate=vector_gate) if edge_out_dim_interaction is not None else None,
440
+ # })
441
+ self.edge_out = MyModuleDict({
442
+ ('ligand', '', 'ligand'): GVP(edge_h_dim_ligand, edge_out_dim_ligand, activations=(None, None), vector_gate=vector_gate) if edge_out_dim_ligand is not None else None,
443
+ ('pocket', '', 'pocket'): GVP(edge_h_dim_pocket, edge_out_dim_pocket, activations=(None, None), vector_gate=vector_gate) if edge_out_dim_pocket is not None else None,
444
+ ('ligand', '', 'pocket'): GVP(edge_h_dim_interaction, edge_out_dim_interaction, activations=(None, None), vector_gate=vector_gate) if edge_out_dim_interaction is not None else None,
445
+ ('pocket', '', 'ligand'): GVP(edge_h_dim_interaction, edge_out_dim_interaction, activations=(None, None), vector_gate=vector_gate) if edge_out_dim_interaction is not None else None,
446
+ })
447
+
448
+ def forward(self, node_attr, batch_mask, edge_index, edge_attr):
449
+
450
+ # to hidden dimension
451
+ for k in node_attr.keys():
452
+ node_attr[k] = self.node_in[k](node_attr[k])
453
+
454
+ for k in edge_attr.keys():
455
+ edge_attr[k] = self.edge_in[k](edge_attr[k])
456
+
457
+ # convolutions
458
+ for layer in self.layers:
459
+ out = layer(node_attr, edge_index, edge_attr)
460
+ if self.update_edge_attr:
461
+ node_attr, edge_attr = out
462
+ else:
463
+ node_attr = out
464
+
465
+ # to output dimension
466
+ for k in node_attr.keys():
467
+ node_attr[k] = self.node_out[k](node_attr[k]) \
468
+ if self.node_out[k] is not None else None
469
+
470
+ if self.update_edge_attr:
471
+ for k in edge_attr.keys():
472
+ if self.edge_out[k] is not None:
473
+ edge_attr[k] = self.edge_out[k](edge_attr[k])
474
+
475
+ return node_attr, edge_attr
476
+
477
+
478
+ class DynamicsHetero(DynamicsBase):
479
+ def __init__(self, atom_nf, residue_nf, bond_dict, pocket_bond_dict,
480
+ condition_time=True,
481
+ num_rbf_time=None,
482
+ model='gvp',
483
+ model_params=None,
484
+ edge_cutoff_ligand=None,
485
+ edge_cutoff_pocket=None,
486
+ edge_cutoff_interaction=None,
487
+ predict_angles=False,
488
+ predict_frames=False,
489
+ add_cycle_counts=False,
490
+ add_spectral_feat=False,
491
+ add_nma_feat=False,
492
+ reflection_equiv=False,
493
+ d_max=15.0,
494
+ num_rbf_dist=16,
495
+ self_conditioning=False,
496
+ augment_residue_sc=False,
497
+ augment_ligand_sc=False,
498
+ add_chi_as_feature=False,
499
+ angle_act_fn=False,
500
+ add_all_atom_diff=False,
501
+ predict_confidence=False):
502
+
503
+ super().__init__(
504
+ predict_angles=predict_angles,
505
+ predict_frames=predict_frames,
506
+ add_cycle_counts=add_cycle_counts,
507
+ add_spectral_feat=add_spectral_feat,
508
+ self_conditioning=self_conditioning,
509
+ augment_residue_sc=augment_residue_sc,
510
+ augment_ligand_sc=augment_ligand_sc
511
+ )
512
+
513
+ self.model = model
514
+ self.edge_cutoff_l = edge_cutoff_ligand
515
+ self.edge_cutoff_p = edge_cutoff_pocket
516
+ self.edge_cutoff_i = edge_cutoff_interaction
517
+ self.bond_dict = bond_dict
518
+ self.pocket_bond_dict = pocket_bond_dict
519
+ self.bond_nf = len(bond_dict)
520
+ self.pocket_bond_nf = len(pocket_bond_dict)
521
+ # self.edge_dim = edge_dim
522
+ self.add_nma_feat = add_nma_feat
523
+ self.add_chi_as_feature = add_chi_as_feature
524
+ self.add_all_atom_diff = add_all_atom_diff
525
+ self.condition_time = condition_time
526
+ self.predict_confidence = predict_confidence
527
+
528
+ # edge encoding params
529
+ self.reflection_equiv = reflection_equiv
530
+ self.d_max = d_max
531
+ self.num_rbf = num_rbf_dist
532
+
533
+
534
+ # Output dimensions, always tuple (scalar, vector)
535
+ _atom_out = (atom_nf[0], 1) if isinstance(atom_nf, Iterable) else (atom_nf, 1)
536
+ _residue_out = (0, 0)
537
+
538
+ if self.predict_confidence:
539
+ _atom_out = tuple_sum(_atom_out, (1, 0))
540
+
541
+ if self.predict_angles:
542
+ _residue_out = tuple_sum(_residue_out, (5, 0))
543
+
544
+ if self.predict_frames:
545
+ _residue_out = tuple_sum(_residue_out, (3, 1))
546
+
547
+
548
+ # Input dimensions, always tuple (scalar, vector)
549
+ assert isinstance(atom_nf, int), "expected: element onehot"
550
+ _atom_in = (atom_nf, 0)
551
+ assert isinstance(residue_nf, Iterable), "expected: (AA-onehot, vectors to atoms)"
552
+ _residue_in = tuple(residue_nf)
553
+ _residue_atom_dim = residue_nf[1]
554
+
555
+ if self.add_cycle_counts:
556
+ _atom_in = tuple_sum(_atom_in, (3, 0))
557
+ if self.add_spectral_feat:
558
+ _atom_in = tuple_sum(_atom_in, (5, 0))
559
+
560
+ if self.add_nma_feat:
561
+ _residue_in = tuple_sum(_residue_in, (0, 5))
562
+
563
+ if self.add_chi_as_feature:
564
+ _residue_in = tuple_sum(_residue_in, (5, 0))
565
+
566
+ if self.condition_time:
567
+ self.embed_time = num_rbf_time is not None
568
+ self.time_dim = num_rbf_time if self.embed_time else 1
569
+
570
+ _atom_in = tuple_sum(_atom_in, (self.time_dim, 0))
571
+ _residue_in = tuple_sum(_residue_in, (self.time_dim, 0))
572
+ else:
573
+ print('Warning: dynamics model is NOT conditioned on time.')
574
+
575
+ if self.self_conditioning:
576
+ _atom_in = tuple_sum(_atom_in, _atom_out)
577
+ _residue_in = tuple_sum(_residue_in, _residue_out)
578
+
579
+ if self.augment_ligand_sc:
580
+ _atom_in = tuple_sum(_atom_in, (0, 1))
581
+
582
+ if self.augment_residue_sc:
583
+ assert self.predict_angles
584
+ _residue_in = tuple_sum(_residue_in, (0, _residue_atom_dim))
585
+
586
+
587
+ # Edge output dimensions, always tuple (scalar, vector)
588
+ _edge_ligand_out = (self.bond_nf, 0)
589
+ _edge_ligand_before_symmetrization = (model_params.edge_h_dim[0], 0)
590
+
591
+
592
+ # Edge input dimensions, always tuple (scalar, vector)
593
+ _edge_ligand_in = (self.bond_nf + self.num_rbf, 1 if self.reflection_equiv else 2)
594
+ _edge_ligand_in = tuple_sum(_edge_ligand_in, _atom_in) # src node
595
+ _edge_ligand_in = tuple_sum(_edge_ligand_in, _atom_in) # dst node
596
+
597
+ if self_conditioning:
598
+ _edge_ligand_in = tuple_sum(_edge_ligand_in, _edge_ligand_out)
599
+
600
+ _n_dist_residue = _residue_atom_dim ** 2 if self.add_all_atom_diff else 1
601
+ _edge_pocket_in = (_n_dist_residue * self.num_rbf + self.pocket_bond_nf, _n_dist_residue)
602
+ _edge_pocket_in = tuple_sum(_edge_pocket_in, _residue_in) # src node
603
+ _edge_pocket_in = tuple_sum(_edge_pocket_in, _residue_in) # dst node
604
+
605
+ _n_dist_interaction = _residue_atom_dim if self.add_all_atom_diff else 1
606
+ _edge_interaction_in = (_n_dist_interaction * self.num_rbf, _n_dist_interaction)
607
+ _edge_interaction_in = tuple_sum(_edge_interaction_in, _atom_in) # atom node
608
+ _edge_interaction_in = tuple_sum(_edge_interaction_in, _residue_in) # residue node
609
+
610
+
611
+ # Embeddings for newly added edges
612
+ _ligand_nobond_nf = self.bond_nf + _edge_ligand_out[0] if self.self_conditioning else self.bond_nf
613
+ self.ligand_nobond_emb = nn.Parameter(torch.zeros(_ligand_nobond_nf), requires_grad=True)
614
+ self.pocket_nobond_emb = nn.Parameter(torch.zeros(self.pocket_bond_nf), requires_grad=True)
615
+
616
+ # for access in self-conditioning
617
+ self.atom_out_dim = _atom_out
618
+ self.residue_out_dim = _residue_out
619
+ self.edge_out_dim = _edge_ligand_out
620
+
621
+ if model == 'gvp':
622
+
623
+ self.net = GVPModel(
624
+ node_in_dim_ligand=_atom_in,
625
+ node_in_dim_pocket=_residue_in,
626
+ edge_in_dim_ligand=_edge_ligand_in,
627
+ edge_in_dim_pocket=_edge_pocket_in,
628
+ edge_in_dim_interaction=_edge_interaction_in,
629
+ node_h_dim_ligand=model_params.node_h_dim,
630
+ node_h_dim_pocket=model_params.node_h_dim,
631
+ edge_h_dim_ligand=model_params.edge_h_dim,
632
+ edge_h_dim_pocket=model_params.edge_h_dim,
633
+ edge_h_dim_interaction=model_params.edge_h_dim,
634
+ node_out_dim_ligand=_atom_out,
635
+ node_out_dim_pocket=_residue_out,
636
+ edge_out_dim_ligand=_edge_ligand_before_symmetrization,
637
+ edge_out_dim_pocket=None,
638
+ edge_out_dim_interaction=None,
639
+ num_layers=model_params.n_layers,
640
+ drop_rate=model_params.dropout,
641
+ vector_gate=model_params.vector_gate,
642
+ update_edge_attr=True
643
+ )
644
+
645
+ else:
646
+ raise NotImplementedError(f"{model} is not available")
647
+
648
+ assert _edge_ligand_out[1] == 0
649
+ assert _edge_ligand_before_symmetrization[1] == 0
650
+ self.edge_decoder = nn.Sequential(
651
+ nn.Linear(_edge_ligand_before_symmetrization[0], _edge_ligand_before_symmetrization[0]),
652
+ torch.nn.SiLU(),
653
+ nn.Linear(_edge_ligand_before_symmetrization[0], _edge_ligand_out[0])
654
+ )
655
+
656
+ if angle_act_fn is None:
657
+ self.angle_act_fn = None
658
+ elif angle_act_fn == 'tanh':
659
+ self.angle_act_fn = lambda x: np.pi * F.tanh(x)
660
+ else:
661
+ raise NotImplementedError(f"Angle activation {angle_act_fn} not available")
662
+
663
+ def _forward(self, x_atoms, h_atoms, mask_atoms, pocket, t, bonds_ligand=None,
664
+ h_atoms_sc=None, e_atoms_sc=None, h_residues_sc=None):
665
+ """
666
+ :param x_atoms:
667
+ :param h_atoms:
668
+ :param mask_atoms:
669
+ :param pocket: must contain keys: 'x', 'one_hot', 'mask', 'bonds' and 'bond_one_hot'
670
+ :param t:
671
+ :param bonds_ligand: tuple - bond indices (2, n_bonds) & bond types (n_bonds, bond_nf)
672
+ :param h_atoms_sc: additional node feature for self-conditioning, (s, V)
673
+ :param e_atoms_sc: additional edge feature for self-conditioning, only scalar
674
+ :param h_residues_sc: additional node feature for self-conditioning, tensor or tuple
675
+ :return:
676
+ """
677
+ x_residues, h_residues, mask_residues = pocket['x'], pocket['one_hot'], pocket['mask']
678
+ if 'bonds' in pocket:
679
+ bonds_pocket = (pocket['bonds'], pocket['bond_one_hot'])
680
+ else:
681
+ bonds_pocket = None
682
+
683
+ if self.add_chi_as_feature:
684
+ h_residues = torch.cat([h_residues, pocket['chi'][:, :5]], dim=-1)
685
+
686
+ if 'v' in pocket:
687
+ v_residues = pocket['v']
688
+ if self.add_nma_feat:
689
+ v_residues = torch.cat([v_residues, pocket['nma_vec']], dim=1)
690
+ h_residues = (h_residues, v_residues)
691
+
692
+ # NOTE: 'bond' denotes one-directional edges and 'edge' means bi-directional
693
+ # get graph edges and edge attributes
694
+ if bonds_ligand is not None:
695
+
696
+ ligand_bond_indices = bonds_ligand[0]
697
+
698
+ # make sure messages are passed both ways
699
+ ligand_edge_indices = torch.cat(
700
+ [bonds_ligand[0], bonds_ligand[0].flip(dims=[0])], dim=1)
701
+ ligand_edge_types = torch.cat([bonds_ligand[1], bonds_ligand[1]], dim=0)
702
+ if e_atoms_sc is not None:
703
+ e_atoms_sc = torch.cat([e_atoms_sc, e_atoms_sc], dim=0)
704
+
705
+ # add auxiliary features to ligand nodes
706
+ extra_features = self.compute_extra_features(
707
+ mask_atoms, ligand_edge_indices, ligand_edge_types.argmax(-1))
708
+ h_atoms = torch.cat([h_atoms, extra_features], dim=-1)
709
+
710
+ if bonds_pocket is not None:
711
+ # make sure messages are passed both ways
712
+ pocket_edge_indices = torch.cat(
713
+ [bonds_pocket[0], bonds_pocket[0].flip(dims=[0])], dim=1)
714
+ pocket_edge_types = torch.cat([bonds_pocket[1], bonds_pocket[1]], dim=0)
715
+
716
+
717
+ # Self-conditioning
718
+ if h_atoms_sc is not None:
719
+ h_atoms = (torch.cat([h_atoms, h_atoms_sc[0]], dim=-1), h_atoms_sc[1])
720
+
721
+ if e_atoms_sc is not None:
722
+ ligand_edge_types = torch.cat([ligand_edge_types, e_atoms_sc], dim=-1)
723
+
724
+ if h_residues_sc is not None:
725
+ # if self.augment_residue_sc:
726
+ if isinstance(h_residues_sc, tuple):
727
+ h_residues = (torch.cat([h_residues[0], h_residues_sc[0]], dim=-1),
728
+ torch.cat([h_residues[1], h_residues_sc[1]], dim=1))
729
+ else:
730
+ h_residues = (torch.cat([h_residues[0], h_residues_sc], dim=-1),
731
+ h_residues[1])
732
+
733
+ if self.condition_time:
734
+ if self.embed_time:
735
+ t = _rbf(t.squeeze(-1), D_min=0.0, D_max=1.0, D_count=self.time_dim, device=t.device)
736
+ if isinstance(h_atoms, tuple):
737
+ h_atoms = (torch.cat([h_atoms[0], t[mask_atoms]], dim=1), h_atoms[1])
738
+ else:
739
+ h_atoms = torch.cat([h_atoms, t[mask_atoms]], dim=1)
740
+ h_residues = (torch.cat([h_residues[0], t[mask_residues]], dim=1), h_residues[1])
741
+
742
+ empty_pocket = (len(pocket['x']) == 0)
743
+
744
+ # Process edges and encode in shared feature space
745
+ edge_index_dict, edge_attr_dict = self.get_edges(
746
+ x_atoms, h_atoms, mask_atoms, ligand_edge_indices, ligand_edge_types,
747
+ x_residues, h_residues, mask_residues, pocket['v'], pocket_edge_indices, pocket_edge_types,
748
+ empty_pocket=empty_pocket
749
+ )
750
+
751
+ if not empty_pocket:
752
+ node_attr_dict = {
753
+ 'ligand': h_atoms,
754
+ 'pocket': h_residues,
755
+ }
756
+ batch_mask_dict = {
757
+ 'ligand': mask_atoms,
758
+ 'pocket': mask_residues,
759
+ }
760
+ else:
761
+ node_attr_dict = {'ligand': h_atoms}
762
+ batch_mask_dict = {'ligand': mask_atoms}
763
+
764
+ if self.model == 'gvp' or self.model == 'gvp_transformer':
765
+ out_node_attr, out_edge_attr = self.net(
766
+ node_attr_dict, batch_mask_dict, edge_index_dict, edge_attr_dict)
767
+
768
+ else:
769
+ raise NotImplementedError(f"Wrong model ({self.model})")
770
+
771
+ h_final_atoms = out_node_attr['ligand'][0]
772
+ vel = out_node_attr['ligand'][1].squeeze(-2)
773
+
774
+ if torch.any(torch.isnan(vel)) or torch.any(torch.isnan(h_final_atoms)):
775
+ if self.training:
776
+ vel[torch.isnan(vel)] = 0.0
777
+ h_final_atoms[torch.isnan(h_final_atoms)] = 0.0
778
+ else:
779
+ raise ValueError("NaN detected in network output")
780
+
781
+ # predict edge type
782
+ edge_final = out_edge_attr[('ligand', '', 'ligand')]
783
+ edges = edge_index_dict[('ligand', '', 'ligand')]
784
+
785
+ # Symmetrize
786
+ edge_logits = torch.zeros(
787
+ (len(mask_atoms), len(mask_atoms), edge_final.size(-1)),
788
+ device=mask_atoms.device)
789
+ edge_logits[edges[0], edges[1]] = edge_final
790
+ edge_logits = (edge_logits + edge_logits.transpose(0, 1)) * 0.5
791
+
792
+ # return upper triangular elements only (matching the input)
793
+ edge_logits = edge_logits[ligand_bond_indices[0], ligand_bond_indices[1]]
794
+ # assert (edge_logits == 0).sum() == 0
795
+
796
+ edge_final_atoms = self.edge_decoder(edge_logits)
797
+
798
+ pred_ligand = {'vel': vel, 'logits_e': edge_final_atoms}
799
+
800
+ if self.predict_confidence:
801
+ pred_ligand['logits_h'] = h_final_atoms[:, :-1]
802
+ pred_ligand['uncertainty_vel'] = F.softplus(h_final_atoms[:, -1])
803
+ else:
804
+ pred_ligand['logits_h'] = h_final_atoms
805
+
806
+ pred_residues = {}
807
+
808
+ # Predict torsion angles
809
+ if self.predict_angles and self.predict_frames:
810
+ residue_s, residue_v = out_node_attr['pocket']
811
+ pred_residues['chi'] = residue_s[:, :5]
812
+ pred_residues['rot'] = residue_s[:, 5:]
813
+ pred_residues['trans'] = residue_v.squeeze(1)
814
+
815
+ elif self.predict_frames:
816
+ pred_residues['rot'], pred_residues['trans'] = out_node_attr['pocket']
817
+ pred_residues['trans'] = pred_residues['trans'].squeeze(1)
818
+
819
+ elif self.predict_angles:
820
+ pred_residues['chi'] = out_node_attr['pocket']
821
+
822
+ if self.angle_act_fn is not None and 'chi' in pred_residues:
823
+ pred_residues['chi'] = self.angle_act_fn(pred_residues['chi'])
824
+
825
+ return pred_ligand, pred_residues
826
+
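The edge-type readout in `_forward` above uses a small symmetrization trick: directed edge features are scattered into a dense (N, N, C) tensor, averaged with their transpose, and read back at the one-directional bond indices. A minimal standalone sketch with toy sizes (not part of the module):

```python
import torch

N, C = 4, 3                                        # toy graph: 4 atoms, 3 bond classes
edges = torch.tensor([[0, 1, 2], [1, 2, 3]])       # directed bond indices i -> j
edge_final = torch.randn(edges.shape[1], C)        # per-edge logits from the network

logits = torch.zeros(N, N, C)
logits[edges[0], edges[1]] = edge_final
logits = 0.5 * (logits + logits.transpose(0, 1))   # enforce f(i, j) == f(j, i)

bond_logits = logits[edges[0], edges[1]]           # back to per-bond features
assert torch.allclose(bond_logits, logits[edges[1], edges[0]])
```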
827
+ def get_edges(self, x_ligand, h_ligand, batch_mask_ligand, edges_ligand, edge_feat_ligand,
828
+ x_pocket, h_pocket, batch_mask_pocket, atom_vectors_pocket, edges_pocket, edge_feat_pocket,
829
+ self_edges=False, empty_pocket=False):
830
+
831
+ # Adjacency matrix
832
+ adj_ligand = batch_mask_ligand[:, None] == batch_mask_ligand[None, :]
833
+ adj_pocket = batch_mask_pocket[:, None] == batch_mask_pocket[None, :]
834
+ adj_cross = batch_mask_ligand[:, None] == batch_mask_pocket[None, :]
835
+
836
+ if self.edge_cutoff_l is not None:
837
+ adj_ligand = adj_ligand & (torch.cdist(x_ligand, x_ligand) <= self.edge_cutoff_l)
838
+
839
+ # Add missing bonds if they got removed
840
+ adj_ligand[edges_ligand[0], edges_ligand[1]] = True
841
+
842
+ if not self_edges:
843
+ adj_ligand = adj_ligand ^ torch.eye(*adj_ligand.size(), out=torch.empty_like(adj_ligand))
844
+
845
+ if self.edge_cutoff_p is not None and not empty_pocket:
846
+ adj_pocket = adj_pocket & (torch.cdist(x_pocket, x_pocket) <= self.edge_cutoff_p)
847
+
848
+ # Add missing bonds if they got removed
849
+ adj_pocket[edges_pocket[0], edges_pocket[1]] = True
850
+
851
+ if not self_edges:
852
+ adj_pocket = adj_pocket ^ torch.eye(*adj_pocket.size(), out=torch.empty_like(adj_pocket))
853
+
854
+ if self.edge_cutoff_i is not None and not empty_pocket:
855
+ adj_cross = adj_cross & (torch.cdist(x_ligand, x_pocket) <= self.edge_cutoff_i)
856
+
857
+ # ligand-ligand edge features
858
+ edges_ligand_updated = torch.stack(torch.where(adj_ligand), dim=0)
859
+ feat_ligand = self.ligand_nobond_emb.repeat(*adj_ligand.shape, 1)
860
+ feat_ligand[edges_ligand[0], edges_ligand[1]] = edge_feat_ligand
861
+ feat_ligand = feat_ligand[edges_ligand_updated[0], edges_ligand_updated[1]]
862
+ feat_ligand = self.ligand_edge_features(h_ligand, x_ligand, edges_ligand_updated, batch_mask_ligand, edge_attr=feat_ligand)
863
+
864
+ if not empty_pocket:
865
+ # residue-residue edge features
866
+ edges_pocket_updated = torch.stack(torch.where(adj_pocket), dim=0)
867
+ feat_pocket = self.pocket_nobond_emb.repeat(*adj_pocket.shape, 1)
868
+ feat_pocket[edges_pocket[0], edges_pocket[1]] = edge_feat_pocket
869
+ feat_pocket = feat_pocket[edges_pocket_updated[0], edges_pocket_updated[1]]
870
+ feat_pocket = self.pocket_edge_features(h_pocket, x_pocket, atom_vectors_pocket, edges_pocket_updated, edge_attr=feat_pocket)
871
+
872
+ # ligand-residue edge features
873
+ edges_cross = torch.stack(torch.where(adj_cross), dim=0)
874
+ feat_cross = self.cross_edge_features(h_ligand, x_ligand, h_pocket, x_pocket, atom_vectors_pocket, edges_cross)
875
+
876
+ edge_index = {
877
+ ('ligand', '', 'ligand'): edges_ligand_updated,
878
+ ('pocket', '', 'pocket'): edges_pocket_updated,
879
+ ('ligand', '', 'pocket'): edges_cross,
880
+ ('pocket', '', 'ligand'): edges_cross.flip(dims=[0]),
881
+ }
882
+
883
+ edge_attr = {
884
+ ('ligand', '', 'ligand'): feat_ligand,
885
+ ('pocket', '', 'pocket'): feat_pocket,
886
+ ('ligand', '', 'pocket'): feat_cross,
887
+ ('pocket', '', 'ligand'): feat_cross,
888
+ }
889
+ else:
890
+ edge_index = {('ligand', '', 'ligand'): edges_ligand_updated}
891
+ edge_attr = {('ligand', '', 'ligand'): feat_ligand}
892
+
893
+ return edge_index, edge_attr
894
+
895
+ def ligand_edge_features(self, h, x, edge_index, batch_mask=None, edge_attr=None):
896
+ """
897
+ :param h: (s, V)
898
+ :param x:
899
+ :param edge_index:
900
+ :param batch_mask:
901
+ :param edge_attr:
902
+ :return: scalar and vector-valued edge features
903
+ """
904
+ row, col = edge_index
905
+ coord_diff = x[row] - x[col]
906
+ dist = coord_diff.norm(dim=-1)
907
+ rbf = _rbf(dist, D_max=self.d_max, D_count=self.num_rbf,
908
+ device=x.device)
909
+
910
+ if isinstance(h, tuple):
911
+ edge_s = torch.cat([h[0][row], h[0][col], rbf], dim=1)
912
+ edge_v = torch.cat([h[1][row], h[1][col], _normalize(coord_diff).unsqueeze(-2)], dim=1)
913
+ else:
914
+ edge_s = torch.cat([h[row], h[col], rbf], dim=1)
915
+ edge_v = _normalize(coord_diff).unsqueeze(-2)
916
+
917
+ # edge_s = rbf
918
+ # edge_v = _normalize(coord_diff).unsqueeze(-2)
919
+
920
+ if edge_attr is not None:
921
+ edge_s = torch.cat([edge_s, edge_attr], dim=1)
922
+
923
+ # self.reflection_equiv: bool, use reflection-sensitive feature based on
924
+ # the cross product if False
925
+ if not self.reflection_equiv:
926
+ mean = scatter_mean(x, batch_mask, dim=0,
927
+ dim_size=batch_mask.max() + 1)
928
+ row, col = edge_index
929
+ cross = torch.cross(x[row] - mean[batch_mask[row]],
930
+ x[col] - mean[batch_mask[col]], dim=1)
931
+ cross = _normalize(cross).unsqueeze(-2)
932
+
933
+ edge_v = torch.cat([edge_v, cross], dim=-2)
934
+
935
+ return torch.nan_to_num(edge_s), torch.nan_to_num(edge_v)
936
+
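The cross-product feature added when `reflection_equiv` is `False` is a pseudo-vector, which is what makes the model sensitive to mirror images. A quick standalone check of the sign flip under a reflection (illustration only):

```python
import torch

a, b = torch.randn(3), torch.randn(3)
P = torch.diag(torch.tensor([1.0, 1.0, -1.0]))  # reflection through the z=0 plane

# Under a reflection P, cross products pick up an extra factor det(P) = -1,
# unlike a plain difference vector, which transforms as P (x_i - x_j).
lhs = torch.cross(P @ a, P @ b, dim=0)
rhs = -(P @ torch.cross(a, b, dim=0))
assert torch.allclose(lhs, rhs, atol=1e-6)
```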
937
+ def pocket_edge_features(self, h, x, v, edge_index, edge_attr=None):
938
+ """
939
+ :param h: (s, V)
940
+ :param x:
941
+ :param v:
942
+ :param edge_index:
943
+ :param edge_attr:
944
+ :return: scalar and vector-valued edge features
945
+ """
946
+ row, col = edge_index
947
+
948
+ if self.add_all_atom_diff:
949
+ all_coord = v + x.unsqueeze(1) # (nR, nA, 3)
950
+ coord_diff = all_coord[row, :, None, :] - all_coord[col, None, :, :] # (nB, nA, nA, 3)
951
+ coord_diff = coord_diff.flatten(1, 2)
952
+ dist = coord_diff.norm(dim=-1) # (nB, nA^2)
953
+ rbf = _rbf(dist, D_max=self.d_max, D_count=self.num_rbf, device=x.device) # (nB, nA^2, rbf_dim)
954
+ rbf = rbf.flatten(1, 2)
955
+ coord_diff = _normalize(coord_diff)
956
+ else:
957
+ coord_diff = x[row] - x[col]
958
+ dist = coord_diff.norm(dim=-1)
959
+ rbf = _rbf(dist, D_max=self.d_max, D_count=self.num_rbf, device=x.device)
960
+ coord_diff = _normalize(coord_diff).unsqueeze(-2)
961
+
962
+ edge_s = torch.cat([h[0][row], h[0][col], rbf], dim=1)
963
+ edge_v = torch.cat([h[1][row], h[1][col], coord_diff], dim=1)
964
+ # edge_s = rbf
965
+ # edge_v = coord_diff
966
+
967
+ if edge_attr is not None:
968
+ edge_s = torch.cat([edge_s, edge_attr], dim=1)
969
+
970
+ return torch.nan_to_num(edge_s), torch.nan_to_num(edge_v)
971
+
972
+ def cross_edge_features(self, h_ligand, x_ligand, h_pocket, x_pocket, v_pocket, edge_index):
973
+ """
974
+ :param h_ligand: (s, V)
975
+ :param x_ligand:
976
+ :param h_pocket: (s, V)
977
+ :param x_pocket:
978
+ :param v_pocket:
979
+ :param edge_index: first row indexes into the ligand tensors, second row into the pocket tensors
980
+
981
+ :return: scalar and vector-valued edge features
982
+ """
983
+ ligand_idx, pocket_idx = edge_index
984
+
985
+ if self.add_all_atom_diff:
986
+ all_coord_pocket = v_pocket + x_pocket.unsqueeze(1) # (nR, nA, 3)
987
+ coord_diff = x_ligand[ligand_idx, None, :] - all_coord_pocket[pocket_idx] # (nB, nA, 3)
988
+ dist = coord_diff.norm(dim=-1) # (nB, nA)
989
+ rbf = _rbf(dist, D_max=self.d_max, D_count=self.num_rbf, device=x_ligand.device) # (nB, nA, rbf_dim)
990
+ rbf = rbf.flatten(1, 2)
991
+ coord_diff = _normalize(coord_diff)
992
+ else:
993
+ coord_diff = x_ligand[ligand_idx] - x_pocket[pocket_idx]
994
+ dist = coord_diff.norm(dim=-1) # (nB, nA)
995
+ rbf = _rbf(dist, D_max=self.d_max, D_count=self.num_rbf, device=x_ligand.device)
996
+ coord_diff = _normalize(coord_diff).unsqueeze(-2)
997
+
998
+ if isinstance(h_ligand, tuple):
999
+ edge_s = torch.cat([h_ligand[0][ligand_idx], h_pocket[0][pocket_idx], rbf], dim=1)
1000
+ edge_v = torch.cat([h_ligand[1][ligand_idx], h_pocket[1][pocket_idx], coord_diff], dim=1)
1001
+ else:
1002
+ edge_s = torch.cat([h_ligand[ligand_idx], h_pocket[0][pocket_idx], rbf], dim=1)
1003
+ edge_v = torch.cat([h_pocket[1][pocket_idx], coord_diff], dim=1)
1004
+
1005
+ # edge_s = rbf
1006
+ # edge_v = coord_diff
1007
+
1008
+ return torch.nan_to_num(edge_s), torch.nan_to_num(edge_v)
src/model/flows.py ADDED
@@ -0,0 +1,448 @@
1
+ from abc import ABC
2
+ from abc import abstractmethod
3
+ import math
4
+ import torch
5
+ from torch_scatter import scatter_mean, scatter_add
6
+
7
+ import src.data.so3_utils as so3
8
+
9
+
10
+ class ICFM(ABC):
11
+ """
12
+ Abstract base class for all Independent-coupling CFM classes.
13
+ Defines a common interface.
14
+ Notation:
15
+ - zt is the intermediate representation at time step t \in [0, 1]
16
+ - zs is the noised representation at time step s < t
17
+
18
+ # TODO: add interpolation schedule (not necessarily linear)
19
+ """
20
+ def __init__(self, sigma):
21
+ self.sigma = sigma
22
+
23
+ @abstractmethod
24
+ def sample_zt(self, z0, z1, t, *args, **kwargs):
25
+ """ TODO. """
26
+ pass
27
+
28
+ @abstractmethod
29
+ def sample_zt_given_zs(self, *args, **kwargs):
30
+ """ Perform update, typically using an explicit Euler step. """
31
+ pass
32
+
33
+ @abstractmethod
34
+ def sample_z0(self, *args, **kwargs):
35
+ """ Prior. """
36
+ pass
37
+
38
+ @abstractmethod
39
+ def compute_loss(self, pred, z0, z1, *args, **kwargs):
40
+ """ Compute loss per sample. """
41
+ pass
42
+
43
+
44
+ class CoordICFM(ICFM):
45
+ def __init__(self, sigma):
46
+ self.dim = 3
47
+ self.scale = 2.7
48
+ super().__init__(sigma)
49
+
50
+ def sample_zt(self, z0, z1, t, batch_mask):
51
+ zt = t[batch_mask] * z1 + (1 - t)[batch_mask] * z0
52
+ # zt = self.sigma * z0 + t[batch_mask] * z1 + (1 - t)[batch_mask] * z0 # TODO: do we have to compute Psi?
53
+ return zt
54
+
55
+ def sample_zt_given_zs(self, zs, pred, s, t, batch_mask):
56
+ """ Perform an explicit Euler step. """
57
+ step_size = t - s
58
+ zt = zs + step_size[batch_mask] * self.scale * pred
59
+ return zt
60
+
61
+ def sample_z0(self, com, batch_mask):
62
+ """ Prior. """
63
+ z0 = torch.randn((len(batch_mask), self.dim), device=batch_mask.device)
64
+
65
+ # Move center of mass
66
+ z0 = z0 + com[batch_mask]
67
+
68
+ return z0
69
+
70
+ def reduce_loss(self, loss, batch_mask, reduce):
71
+ assert reduce in {'mean', 'sum', 'none'}
72
+
73
+ if reduce == 'mean':
74
+ loss = scatter_mean(loss / self.dim, batch_mask, dim=0)
75
+ elif reduce == 'sum':
76
+ loss = scatter_add(loss, batch_mask, dim=0)
77
+
78
+ return loss
79
+
80
+ def compute_loss(self, pred, z0, z1, t, batch_mask, reduce='mean'):
81
+ """ Compute loss per sample. """
82
+
83
+ loss = torch.sum((pred - (z1 - z0) / self.scale) ** 2, dim=-1)
84
+
85
+ return self.reduce_loss(loss, batch_mask, reduce)
86
+
87
+ def get_z1_given_zt_and_pred(self, zt, pred, z0, t, batch_mask):
88
+ """ Make a best guess on the final state z1 given the current state and
89
+ the network prediction. """
90
+ # z1 = z0 + pred
91
+ z1 = zt + (1 - t)[batch_mask] * pred
92
+ return z1
93
+
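As a consistency check on `CoordICFM` (a sketch, not part of the module): with an ideal prediction `pred = (z1 - z0) / scale`, which is the regression target of `compute_loss`, one Euler step from s to t lands exactly on the linear interpolant.

```python
import torch

icfm = CoordICFM(sigma=0.0)
batch = torch.zeros(5, dtype=torch.long)              # one sample with 5 points
z0, z1 = torch.randn(5, 3), torch.randn(5, 3)
s, t = torch.tensor([[0.2]]), torch.tensor([[0.7]])

zs = icfm.sample_zt(z0, z1, s, batch)                 # point on the interpolant at s
pred = (z1 - z0) / icfm.scale                         # ideal network output
zt = icfm.sample_zt_given_zs(zs, pred, s, t, batch)
assert torch.allclose(zt, icfm.sample_zt(z0, z1, t, batch), atol=1e-6)
```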
94
+
95
+ class TorusICFM(ICFM):
96
+ """
97
+ Following:
98
+ Chen, Ricky TQ, and Yaron Lipman.
99
+ "Riemannian flow matching on general geometries."
100
+ arXiv preprint arXiv:2302.03660 (2023).
101
+ """
102
+ def __init__(self, sigma, dim, scheduler_args=None):
103
+ super().__init__(sigma)
104
+ self.dim = dim
105
+
106
+ # Scheduler that determines the rate at which the geodesic distance decreases
107
+ scheduler_args = scheduler_args or {}
108
+ scheduler_args["type"] = scheduler_args.get("type", "linear") # default
109
+ scheduler_args["learn_scaled"] = scheduler_args.get("learn_scaled", False) # default
110
+
111
+ # linear scheduler: kappa(t) = 1-t (default)
112
+ if scheduler_args["type"] == "linear":
113
+ # equivalent to: 1 - kappa(t)
114
+ self.flow_scaling = lambda t: t
115
+
116
+ # equivalent to: -1 * d/dt kappa(t)
117
+ self.velocity_scaling = lambda t: torch.ones_like(t)
118
+
119
+ # exponential scheduler: kappa(t) = exp(-c*t)
120
+ elif scheduler_args["type"] == "exponential":
121
+
122
+ self.c = scheduler_args["c"]
123
+ assert self.c > 0
124
+
125
+ # equivalent to: 1 - kappa(t)
126
+ self.flow_scaling = lambda t: 1 - torch.exp(-self.c * t)
127
+
128
+ # equivalent to: -1 * d/dt kappa(t)
129
+ self.velocity_scaling = lambda t: self.c * torch.exp(-self.c * t)
130
+
131
+ # polynomial scheduler: kappa(t) = (1-t)^k
132
+ elif scheduler_args["type"] == "polynomial":
133
+ self.k = scheduler_args["k"]
134
+ assert self.k > 0
135
+
136
+ # equivalent to: 1 - kappa(t)
137
+ self.flow_scaling = lambda t: 1 - (1 - t)**self.k
138
+
139
+ # equivalent to: -1 * d/dt kappa(t)
140
+ self.velocity_scaling = lambda t: self.k * (1 - t)**(self.k - 1)
141
+
142
+ else:
143
+ raise NotImplementedError(f"Scheduler {scheduler_args['type']} not implemented.")
144
+
145
+ kappa_interval = self.flow_scaling(torch.tensor([0.0, 1.0]))
146
+ if kappa_interval[0] != 0.0 or kappa_interval[1] != 1.0:
147
+ print(f"Scheduler should satisfy kappa(0)=1 and kappa(1)=0. Found "
148
+ f"interval {kappa_interval.tolist()} instead.")
149
+
150
+ # determines whether the scaled vector field is learned or the scheduler
151
+ # is post-multiplied
152
+ self.learn_scaled = scheduler_args["learn_scaled"]
153
+
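For intuition, here are the three schedulers above written directly as kappa(t), with the endpoint conditions made explicit (a sketch; the values of c and k are arbitrary illustrations):

```python
import torch

def kappa_linear(t):
    return 1 - t

def kappa_exponential(t, c=2.0):   # c > 0, arbitrary example value
    return torch.exp(-c * t)

def kappa_polynomial(t, k=2.0):    # k > 0, arbitrary example value
    return (1 - t) ** k

t = torch.tensor([0.0, 1.0])
for kappa in (kappa_linear, kappa_exponential, kappa_polynomial):
    print(kappa.__name__, kappa(t).tolist())
# linear and polynomial hit [1, 0] exactly; the exponential only reaches
# exp(-c) > 0 at t=1, which is what the endpoint check above warns about.
```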
154
+ @staticmethod
155
+ def wrap(angle):
156
+ """ Maps angles to range [-\pi, \pi). """
157
+ return ((angle + math.pi) % (2 * math.pi)) - math.pi
158
+
159
+ def exponential_map(self, x, u):
160
+ """
161
+ :param x: point on the manifold
162
+ :param u: point on the tangent space
163
+ """
164
+ return self.wrap(x + u)
165
+
166
+ @staticmethod
167
+ def logarithm_map(x, y):
168
+ """
169
+ :param x, y: points on the manifold
170
+ """
171
+ return torch.atan2(torch.sin(y - x), torch.cos(y - x))
172
+
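A small sanity check on the flat-torus geometry defined by `wrap`, `exponential_map` and `logarithm_map` above: exp_x(log_x(y)) recovers y up to wrapping into [-pi, pi).

```python
import math
import torch

x = torch.tensor([3.0, -2.5])
y = torch.tensor([-3.0, 1.0])

u = torch.atan2(torch.sin(y - x), torch.cos(y - x))     # logarithm_map(x, y)
y_rec = ((x + u + math.pi) % (2 * math.pi)) - math.pi   # exponential_map(x, u)
y_wrapped = ((y + math.pi) % (2 * math.pi)) - math.pi   # wrap(y)
assert torch.allclose(y_rec, y_wrapped, atol=1e-6)
```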
173
+ def sample_zt(self, z0, z1, t, batch_mask):
174
+ """ expressed in terms of exponential and logarithm maps """
175
+
176
+ # apply logarithm map
177
+ # zt_tangent = t[batch_mask] * self.logarithm_map(z0, z1)
178
+ zt_tangent = self.flow_scaling(t)[batch_mask] * self.logarithm_map(z0, z1)
179
+
180
+ # apply exponential map
181
+ return self.exponential_map(z0, zt_tangent)
182
+
183
+ def get_z1_given_zt_and_pred(self, zt, pred, z0, t, batch_mask):
184
+ """ Make a best guess on the final state z1 given the current state and
185
+ the network prediction. """
186
+
187
+ # estimate z1_tangent based on zt and pred only
188
+ if self.learn_scaled:
189
+ pred = pred / torch.clamp(self.velocity_scaling(t), min=1e-3)[batch_mask]
190
+
191
+ z1_tangent = (1 - t)[batch_mask] * pred
192
+
193
+ # exponential map
194
+ return self.exponential_map(zt, z1_tangent)
195
+
196
+ def sample_zt_given_zs(self, zs, pred, s, t, batch_mask):
197
+ """ Perform update, typically using an explicit Euler step. """
198
+
199
+ step_size = t - s
200
+ zt_tangent = step_size[batch_mask] * pred
201
+
202
+ if not self.learn_scaled:
203
+ zt_tangent = self.velocity_scaling(t)[batch_mask] * zt_tangent
204
+
205
+ # exponential map
206
+ return self.exponential_map(zs, zt_tangent)
207
+
208
+ def sample_z0(self, batch_mask):
209
+ """ Prior. """
210
+
211
+ # Uniform distribution
212
+ z0 = torch.rand((len(batch_mask), self.dim), device=batch_mask.device)
213
+
214
+ return 2 * math.pi * z0 - math.pi
215
+
216
+ def compute_loss(self, pred, z0, z1, zt, t, batch_mask, reduce='mean'):
217
+ """ Compute loss per sample. """
218
+ assert reduce in {'mean', 'sum', 'none'}
219
+ mask = ~torch.isnan(z1)
220
+ z1 = torch.nan_to_num(z1, nan=0.0)
221
+
222
+ zt_dot = self.logarithm_map(z0, z1)
223
+ if self.learn_scaled:
224
+ # NOTE: potentially requires output magnitude to vary substantially
225
+ zt_dot = self.velocity_scaling(t)[batch_mask] * zt_dot
226
+ loss = mask * (pred - zt_dot) ** 2
227
+ loss = torch.sum(loss, dim=-1)
228
+
229
+ if reduce == 'mean':
230
+ denom = mask.sum(dim=-1) + 1e-6
231
+ loss = scatter_mean(loss / denom, batch_mask, dim=0)
232
+ elif reduce == 'sum':
233
+ loss = scatter_add(loss, batch_mask, dim=0)
234
+ return loss
235
+
236
+
237
+ class SO3ICFM(ICFM):
238
+ """
239
+ All rotations are assumed to be in axis-angle format.
240
+ Mostly following descriptions from the FoldFlow paper:
241
+ https://openreview.net/forum?id=kJFIH23hXb
242
+
243
+ See also:
244
+ https://geomstats.github.io/_modules/geomstats/geometry/special_orthogonal.html#SpecialOrthogonal
245
+ https://geomstats.github.io/_modules/geomstats/geometry/lie_group.html#LieGroup
246
+ """
247
+ def __init__(self, sigma):
248
+ super().__init__(sigma)
249
+
250
+ def exponential_map(self, base, tangent):
251
+ """
252
+ Args:
253
+ base: base point (rotation vector) on the manifold
254
+ tangent: point in tangent space at identity
255
+ Returns:
256
+ rotation vector on the manifold
257
+ """
258
+ # return so3.exp_not_from_identity(tangent, base_point=base)
259
+ return so3.compose_rotations(base, so3.exp(tangent))
260
+
261
+ def logarithm_map(self, base, r):
262
+ """
263
+ Args:
264
+ base: base point (rotation vector) on the manifold
265
+ r: rotation vector on the manifold
266
+ Return:
267
+ point in tangent space at identity
268
+ """
269
+ # return so3.log_not_from_identity(r, base_point=base)
270
+ return so3.log(so3.compose_rotations(-base, r))
271
+
272
+ def sample_zt(self, z0, z1, t, batch_mask):
273
+ """
274
+ Expressed in terms of exponential and logarithm maps.
275
+ Corresponds to SLERP interpolation: R(t) = R1 exp( t * log(R1^T R2) )
276
+ (see https://lucaballan.altervista.org/pdfs/IK.pdf, slide 16)
277
+ """
278
+
279
+ # apply logarithm map
280
+ zt_tangent = t[batch_mask] * self.logarithm_map(z0, z1)
281
+
282
+ # apply exponential map
283
+ return self.exponential_map(z0, zt_tangent)
284
+
285
+ def get_z1_given_zt_and_pred(self, zt, pred, z0, t, batch_mask):
286
+ """ Make a best guess on the final state z1 given the current state and
287
+ the network prediction. """
288
+
289
+ # estimate z1_tangent based on zt and pred only
290
+ z1_tangent = (1 - t)[batch_mask] * pred
291
+
292
+ # exponential map
293
+ return self.exponential_map(zt, z1_tangent)
294
+
295
+ def sample_zt_given_zs(self, zs, pred, s, t, batch_mask):
296
+ """ Perform update, typically using an explicit Euler step. """
297
+
298
+ # # parallel transport vector field to lie algebra so3 (at identity)
299
+ # # (FoldFlow paper, Algorithm 3, line 8)
300
+ # # TODO: is this correct? is it necessary?
301
+ # pred = so3.compose(so3.inverse(zs), pred)
302
+
303
+ step_size = t - s
304
+ zt_tangent = step_size[batch_mask] * pred
305
+
306
+ # exponential map
307
+ return self.exponential_map(zs, zt_tangent)
308
+
309
+ def sample_z0(self, batch_mask):
310
+ """ Prior. """
311
+ return so3.random_uniform(n_samples=len(batch_mask), device=batch_mask.device)
312
+
313
+ @staticmethod
314
+ def d_R_squared_SO3(rot_vec_1, rot_vec_2):
315
+ """
316
+ Squared Riemannian metric on SO(3).
317
+ Defined as d(R1, R2) = sqrt(0.5) ||log(R1^T R2)||_F
318
+ where R1, R2 are rotation matrices.
319
+
320
+ The following is equivalent if the difference between the rotations is
321
+ expressed as a rotation vector \omega_diff:
322
+ d(r1, r2) = ||\omega_diff||_2
323
+ -----
324
+ With the definition of the Frobenius matrix norm ||A||_F^2 = trace(A^H A):
325
+ d^2(R1, R2) = 1/2 ||log(R1^T R2)||_F^2
326
+ = 1/2 || hat(R_d) ||_F^2
327
+ = 1/2 tr( hat(R_d)^T hat(R_d) )
328
+ = 1/2 * 2 * ||\omega||_2^2
329
+ """
330
+
331
+ # rot_mat_1 = so3.matrix_from_rotation_vector(rot_vec_1)
332
+ # rot_mat_2 = so3.matrix_from_rotation_vector(rot_vec_2)
333
+ # rot_mat_diff = rot_mat_1.transpose(-2, -1) @ rot_mat_2
334
+ # return torch.norm(so3.log(rot_mat_diff, as_skew=True), p='fro', dim=(-2, -1))
335
+
336
+ diff_rot = so3.compose_rotations(-rot_vec_1, rot_vec_2)
337
+ return diff_rot.square().sum(dim=-1)
338
+
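A numerical illustration of the identity in the docstring, using rotations about the z-axis where log(R1^T R2) is available in closed form (pure torch sketch, no dependency on `src.data.so3_utils`):

```python
import math
import torch

def Rz(a):  # rotation by angle a about the z-axis
    c, s = math.cos(a), math.sin(a)
    return torch.tensor([[c, -s, 0.0], [s, c, 0.0], [0.0, 0.0, 1.0]])

a, b = 0.3, 1.1
theta = b - a                                  # relative rotation angle
# hat(omega) for omega = (0, 0, theta): the skew-symmetric matrix log(R1^T R2)
hat = torch.tensor([[0.0, -theta, 0.0], [theta, 0.0, 0.0], [0.0, 0.0, 0.0]])

d2_frobenius = 0.5 * (hat ** 2).sum()          # 1/2 ||log(R1^T R2)||_F^2
d2_vector = theta ** 2                         # ||omega||_2^2
assert abs(d2_frobenius.item() - d2_vector) < 1e-6

# sanity check that hat really is the log of the relative rotation
assert torch.allclose(torch.matrix_exp(hat), Rz(a).T @ Rz(b), atol=1e-6)
```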
339
+ def compute_loss(self, pred, z0, z1, zt, t, batch_mask, reduce='mean', eps=5e-2):
340
+ """ Compute loss per sample. """
341
+ assert reduce in {'mean', 'sum', 'none'}
342
+
343
+ zt_dot = self.logarithm_map(zt, z1) / torch.clamp(1 - t, min=eps)[batch_mask]
344
+
345
+ # TODO: do I need this?
346
+ # pred_at_id = self.logarithm_map(zt, pred) / torch.clamp(1 - t, min=eps)[batch_mask]
347
+
348
+ loss = torch.sum((pred - zt_dot)**2, dim=-1) # TODO: is this the right loss in SO3?
349
+ # loss = self.d_R_squared_SO3(zt_dot, pred)
350
+
351
+ if reduce == 'mean':
352
+ loss = scatter_mean(loss, batch_mask, dim=0)
353
+ elif reduce == 'sum':
354
+ loss = scatter_add(loss, batch_mask, dim=0)
355
+
356
+ return loss
357
+
358
+
359
+ #################
360
+ # Predicting z1 #
361
+ #################
362
+
363
+ class CoordICFMPredictFinal(CoordICFM):
364
+ def __init__(self, sigma):
365
+ self.dim = 3
366
+ super().__init__(sigma)
367
+
368
+ def sample_zt_given_zs(self, zs, z1_minus_zs_pred, s, t, batch_mask):
369
+ """ Perform an explicit Euler step. """
370
+
371
+ # step_size = t - s
372
+ # zt = zs + step_size[batch_mask] * z1_minus_zs_pred / (1.0 - s)[batch_mask]
373
+
374
+ # for numerical stability
375
+ step_size = (t - s) / (1.0 - s)
376
+ assert torch.all(step_size <= 1.0)
377
+ # step_size = torch.clamp(step_size, max=1.0)
378
+ zt = zs + step_size[batch_mask] * z1_minus_zs_pred
379
+ return zt
380
+
381
+ def compute_loss(self, z1_minus_zt_pred, z0, z1, t, batch_mask, reduce='mean'):
382
+ """ Compute loss per sample. """
383
+ assert reduce in {'mean', 'sum', 'none'}
384
+ t = torch.clamp(t, max=0.9)
385
+ zt = self.sample_zt(z0, z1, t, batch_mask)
386
+ loss = torch.sum((z1_minus_zt_pred + zt - z1) ** 2, dim=-1) / torch.square(1 - t)[batch_mask].squeeze()
387
+
388
+ if reduce == 'mean':
389
+ loss = scatter_mean(loss / self.dim, batch_mask, dim=0)
390
+ elif reduce == 'sum':
391
+ loss = scatter_add(loss, batch_mask, dim=0)
392
+
393
+ return loss
394
+
395
+ def get_z1_given_zt_and_pred(self, zt, z1_minus_zt_pred, z0, t, batch_mask):
396
+ return z1_minus_zt_pred + zt
397
+
398
+
399
+ class TorusICFMPredictFinal(TorusICFM):
400
+ """
401
+ Following:
402
+ Chen, Ricky TQ, and Yaron Lipman.
403
+ "Riemannian flow matching on general geometries."
404
+ arXiv preprint arXiv:2302.03660 (2023).
405
+ """
406
+ def __init__(self, sigma, dim):
407
+ super().__init__(sigma, dim)
408
+
409
+ def get_z1_given_zt_and_pred(self, zt, z1_tangent_pred, z0, t, batch_mask):
410
+ """ Make a best guess on the final state z1 given the current state and
411
+ the network prediction. """
412
+
413
+ # exponential map
414
+ return self.exponential_map(zt, z1_tangent_pred)
415
+
416
+ def sample_zt_given_zs(self, zs, z1_tangent_pred, s, t, batch_mask):
417
+ """ Perform update, typically using an explicit Euler step. """
418
+
419
+ # step_size = t - s
420
+ # zt_tangent = step_size[batch_mask] * z1_tangent_pred / (1.0 - s)[batch_mask]
421
+
422
+ # for numerical stability
423
+ step_size = (t - s) / (1.0 - s)
424
+ assert torch.all(step_size <= 1.0)
425
+ # step_size = torch.clamp(step_size, max=1.0)
426
+ zt_tangent = step_size[batch_mask] * z1_tangent_pred
427
+
428
+ # exponential map
429
+ return self.exponential_map(zs, zt_tangent)
430
+
431
+ def compute_loss(self, z1_tangent_pred, z0, z1, t, batch_mask, reduce='mean'):
432
+ """ Compute loss per sample. """
433
+ assert reduce in {'mean', 'sum', 'none'}
434
+ zt = self.sample_zt(z0, z1, t, batch_mask)
435
+ t = torch.clamp(t, max=0.9)
436
+
437
+ mask = ~torch.isnan(z1)
438
+ z1 = torch.nan_to_num(z1, nan=0.0)
439
+ loss = mask * (z1_tangent_pred - self.logarithm_map(zt, z1)) ** 2
440
+ loss = torch.sum(loss, dim=-1) / torch.square(1 - t)[batch_mask].squeeze()
441
+
442
+ if reduce == 'mean':
443
+ denom = mask.sum(dim=-1) + 1e-6
444
+ loss = scatter_mean(loss / denom, batch_mask, dim=0)
445
+ elif reduce == 'sum':
446
+ loss = scatter_add(loss, batch_mask, dim=0)
447
+
448
+ return loss
src/model/gvp.py ADDED
@@ -0,0 +1,650 @@
1
+ """
2
+ Geometric Vector Perceptron implementation taken from:
3
+ https://github.com/drorlab/gvp-pytorch/blob/main/gvp/__init__.py
4
+ """
5
+ import copy
6
+ import warnings
7
+
8
+ import torch, functools
9
+ from torch import nn
10
+ import torch.nn.functional as F
11
+ from torch_geometric.nn import MessagePassing
12
+ from torch_scatter import scatter_add, scatter_mean
13
+
14
+
15
+ def tuple_sum(*args):
16
+ '''
17
+ Sums any number of tuples (s, V) elementwise.
18
+ '''
19
+ return tuple(map(sum, zip(*args)))
20
+
21
+
22
+ def tuple_cat(*args, dim=-1):
23
+ '''
24
+ Concatenates any number of tuples (s, V) elementwise.
25
+
26
+ :param dim: dimension along which to concatenate when viewed
27
+ as the `dim` index for the scalar-channel tensors.
28
+ This means that `dim=-1` will be applied as
29
+ `dim=-2` for the vector-channel tensors.
30
+ '''
31
+ dim %= len(args[0][0].shape)
32
+ s_args, v_args = list(zip(*args))
33
+ return torch.cat(s_args, dim=dim), torch.cat(v_args, dim=dim)
34
+
35
+
36
+ def tuple_index(x, idx):
37
+ '''
38
+ Indexes into a tuple (s, V) along the first dimension.
39
+
40
+ :param idx: any object which can be used to index into a `torch.Tensor`
41
+ '''
42
+ return x[0][idx], x[1][idx]
43
+
44
+
45
+ def randn(n, dims, device="cpu"):
46
+ '''
47
+ Returns random tuples (s, V) drawn elementwise from a normal distribution.
48
+
49
+ :param n: number of data points
50
+ :param dims: tuple of dimensions (n_scalar, n_vector)
51
+
52
+ :return: (s, V) with s.shape = (n, n_scalar) and
53
+ V.shape = (n, n_vector, 3)
54
+ '''
55
+ return torch.randn(n, dims[0], device=device), \
56
+ torch.randn(n, dims[1], 3, device=device)
57
+
58
+
59
+ def _norm_no_nan(x, axis=-1, keepdims=False, eps=1e-8, sqrt=True):
60
+ '''
61
+ L2 norm of tensor clamped above a minimum value `eps`.
62
+
63
+ :param sqrt: if `False`, returns the square of the L2 norm
64
+ '''
65
+ out = torch.clamp(torch.sum(torch.square(x), axis, keepdims), min=eps)
66
+ return torch.sqrt(out) if sqrt else out
67
+
68
+
69
+ def _split(x, nv):
70
+ '''
71
+ Splits a merged representation of (s, V) back into a tuple.
72
+ Should be used only with `_merge(s, V)` and only if the tuple
73
+ representation cannot be used.
74
+
75
+ :param x: the `torch.Tensor` returned from `_merge`
76
+ :param nv: the number of vector channels in the input to `_merge`
77
+ '''
78
+ v = torch.reshape(x[..., -3 * nv:], x.shape[:-1] + (nv, 3))
79
+ s = x[..., :-3 * nv]
80
+ return s, v
81
+
82
+
83
+ def _merge(s, v):
84
+ '''
85
+ Merges a tuple (s, V) into a single `torch.Tensor`, where the
86
+ vector channels are flattened and appended to the scalar channels.
87
+ Should be used only if the tuple representation cannot be used.
88
+ Use `_split(x, nv)` to reverse.
89
+ '''
90
+ v = torch.reshape(v, v.shape[:-2] + (3 * v.shape[-2],))
91
+ return torch.cat([s, v], -1)
92
+
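Roundtrip sketch for the (s, V) tuple convention used throughout this file: `_split(_merge(s, V), nv)` recovers the original pair.

```python
import torch

s = torch.randn(8, 5)       # 5 scalar channels per node
V = torch.randn(8, 3, 3)    # 3 vector channels per node
x = _merge(s, V)            # shape (8, 5 + 3 * 3)
s2, V2 = _split(x, nv=3)
assert torch.equal(s, s2) and torch.equal(V, V2)
```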
93
+
94
+ class GVP(nn.Module):
95
+ '''
96
+ Geometric Vector Perceptron. See manuscript and README.md
97
+ for more details.
98
+
99
+ :param in_dims: tuple (n_scalar, n_vector)
100
+ :param out_dims: tuple (n_scalar, n_vector)
101
+ :param h_dim: intermediate number of vector channels, optional
102
+ :param activations: tuple of functions (scalar_act, vector_act)
103
+ :param vector_gate: whether to use vector gating.
104
+ (vector_act will be used as sigma^+ in vector gating if `True`)
105
+ '''
106
+
107
+ def __init__(self, in_dims, out_dims, h_dim=None,
108
+ activations=(F.relu, torch.sigmoid), vector_gate=False):
109
+ super(GVP, self).__init__()
110
+ self.si, self.vi = in_dims
111
+ self.so, self.vo = out_dims
112
+ self.vector_gate = vector_gate
113
+ if self.vi:
114
+ self.h_dim = h_dim or max(self.vi, self.vo)
115
+ self.wh = nn.Linear(self.vi, self.h_dim, bias=False)
116
+ self.ws = nn.Linear(self.h_dim + self.si, self.so)
117
+ if self.vo:
118
+ self.wv = nn.Linear(self.h_dim, self.vo, bias=False)
119
+ if self.vector_gate: self.wsv = nn.Linear(self.so, self.vo)
120
+ else:
121
+ self.ws = nn.Linear(self.si, self.so)
122
+
123
+ self.scalar_act, self.vector_act = activations
124
+ self.dummy_param = nn.Parameter(torch.empty(0))
125
+
126
+ def forward(self, x):
127
+ '''
128
+ :param x: tuple (s, V) of `torch.Tensor`,
129
+ or (if vectors_in is 0), a single `torch.Tensor`
130
+ :return: tuple (s, V) of `torch.Tensor`,
131
+ or (if vectors_out is 0), a single `torch.Tensor`
132
+ '''
133
+ if self.vi:
134
+ s, v = x
135
+ v = torch.transpose(v, -1, -2)
136
+ vh = self.wh(v)
137
+ vn = _norm_no_nan(vh, axis=-2)
138
+ s = self.ws(torch.cat([s, vn], -1))
139
+ if self.vo:
140
+ v = self.wv(vh)
141
+ v = torch.transpose(v, -1, -2)
142
+ if self.vector_gate:
143
+ if self.vector_act:
144
+ gate = self.wsv(self.vector_act(s))
145
+ else:
146
+ gate = self.wsv(s)
147
+ v = v * torch.sigmoid(gate).unsqueeze(-1)
148
+ elif self.vector_act:
149
+ v = v * self.vector_act(
150
+ _norm_no_nan(v, axis=-1, keepdims=True))
151
+ else:
152
+ s = self.ws(x)
153
+ if self.vo:
154
+ v = torch.zeros(s.shape[0], self.vo, 3,
155
+ device=self.dummy_param.device)
156
+ if self.scalar_act:
157
+ s = self.scalar_act(s)
158
+
159
+ return (s, v) if self.vo else s
160
+
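Usage sketch: a `GVP` maps an (s, V) tuple to an (s, V) tuple of the requested output dimensions (the dimensions below are arbitrary).

```python
import torch

gvp = GVP(in_dims=(16, 4), out_dims=(32, 8), vector_gate=True)
s, V = torch.randn(10, 16), torch.randn(10, 4, 3)
s_out, V_out = gvp((s, V))
print(s_out.shape, V_out.shape)   # torch.Size([10, 32]) torch.Size([10, 8, 3])
```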
161
+
162
+ class _VDropout(nn.Module):
163
+ '''
164
+ Vector channel dropout where the elements of each
165
+ vector channel are dropped together.
166
+ '''
167
+
168
+ def __init__(self, drop_rate):
169
+ super(_VDropout, self).__init__()
170
+ self.drop_rate = drop_rate
171
+ self.dummy_param = nn.Parameter(torch.empty(0))
172
+
173
+ def forward(self, x):
174
+ '''
175
+ :param x: `torch.Tensor` corresponding to vector channels
176
+ '''
177
+ device = self.dummy_param.device
178
+ if not self.training:
179
+ return x
180
+ mask = torch.bernoulli(
181
+ (1 - self.drop_rate) * torch.ones(x.shape[:-1], device=device)
182
+ ).unsqueeze(-1)
183
+ x = mask * x / (1 - self.drop_rate)
184
+ return x
185
+
186
+
187
+ class Dropout(nn.Module):
188
+ '''
189
+ Combined dropout for tuples (s, V).
190
+ Takes tuples (s, V) as input and as output.
191
+ '''
192
+
193
+ def __init__(self, drop_rate):
194
+ super(Dropout, self).__init__()
195
+ self.sdropout = nn.Dropout(drop_rate)
196
+ self.vdropout = _VDropout(drop_rate)
197
+
198
+ def forward(self, x):
199
+ '''
200
+ :param x: tuple (s, V) of `torch.Tensor`,
201
+ or single `torch.Tensor`
202
+ (will be assumed to be scalar channels)
203
+ '''
204
+ if type(x) is torch.Tensor:
205
+ return self.sdropout(x)
206
+ s, v = x
207
+ return self.sdropout(s), self.vdropout(v)
208
+
209
+
210
+ class LayerNorm(nn.Module):
211
+ '''
212
+ Combined LayerNorm for tuples (s, V).
213
+ Takes tuples (s, V) as input and as output.
214
+ '''
215
+
216
+ def __init__(self, dims, learnable_vector_weight=False):
217
+ super(LayerNorm, self).__init__()
218
+ self.s, self.v = dims
219
+ self.scalar_norm = nn.LayerNorm(self.s)
220
+ self.vector_norm = VectorLayerNorm(self.v, learnable_vector_weight) \
221
+ if self.v > 0 else None
222
+
223
+ def forward(self, x):
224
+ '''
225
+ :param x: tuple (s, V) of `torch.Tensor`,
226
+ or single `torch.Tensor`
227
+ (will be assumed to be scalar channels)
228
+ '''
229
+ if not self.v:
230
+ return self.scalar_norm(x)
231
+ s, v = x
232
+ # vn = _norm_no_nan(v, axis=-1, keepdims=True, sqrt=False)
233
+ # vn = torch.sqrt(torch.mean(vn, dim=-2, keepdim=True))
234
+ # return self.scalar_norm(s), v / vn
235
+ return self.scalar_norm(s), self.vector_norm(v)
236
+
237
+
238
+ class VectorLayerNorm(nn.Module):
239
+ """
240
+ Equivariant normalization of vector-valued features inspired by:
241
+ Liao, Yi-Lun, and Tess Smidt.
242
+ "Equiformer: Equivariant graph attention transformer for 3d atomistic graphs."
243
+ arXiv preprint arXiv:2206.11990 (2022).
244
+ Section 4.1, "Layer Normalization"
245
+ """
246
+ def __init__(self, n_channels, learnable_weight=True):
247
+ super(VectorLayerNorm, self).__init__()
248
+ self.gamma = nn.Parameter(torch.ones(1, n_channels, 1)) \
249
+ if learnable_weight else None # (1, c, 1)
250
+
251
+ def forward(self, x):
252
+ """
253
+ Computes LN(x) = ( x / RMS( L2-norm(x) ) ) * gamma
254
+ :param x: input tensor (n, c, 3)
255
+ :return: layer normalized vector feature
256
+ """
257
+ norm2 = _norm_no_nan(x, axis=-1, keepdims=True, sqrt=False) # (n, c, 1)
258
+ rms = torch.sqrt(torch.mean(norm2, dim=-2, keepdim=True)) # (n, 1, 1)
259
+ x = x / rms # (n, c, 3)
260
+ if self.gamma is not None:
261
+ x = x * self.gamma
262
+ return x
263
+
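Because only per-channel norms enter the normalization and gamma scales whole channels, the layer is rotation-equivariant; a quick check (sketch):

```python
import torch

ln = VectorLayerNorm(n_channels=4, learnable_weight=True)
x = torch.randn(6, 4, 3)
R = torch.linalg.qr(torch.randn(3, 3)).Q     # random orthogonal matrix

out_of_rotated = ln(x @ R.T)                 # rotate vectors, then normalize
rotated_output = ln(x) @ R.T                 # normalize, then rotate
assert torch.allclose(out_of_rotated, rotated_output, atol=1e-5)
```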
264
+
265
+ class GVPConv(MessagePassing):
266
+ '''
267
+ Graph convolution / message passing with Geometric Vector Perceptrons.
268
+ Takes in a graph with node and edge embeddings,
269
+ and returns new node embeddings.
270
+
271
+ This does NOT do residual updates and pointwise feedforward layers
272
+ ---see `GVPConvLayer`.
273
+
274
+ :param in_dims: input node embedding dimensions (n_scalar, n_vector)
275
+ :param out_dims: output node embedding dimensions (n_scalar, n_vector)
276
+ :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
277
+ :param n_layers: number of GVPs in the message function
278
+ :param module_list: preconstructed message function, overrides n_layers
279
+ :param aggr: should be "add" if some incoming edges are masked, as in
280
+ a masked autoregressive decoder architecture, otherwise "mean"
281
+ :param activations: tuple of functions (scalar_act, vector_act) to use in GVPs
282
+ :param vector_gate: whether to use vector gating.
283
+ (vector_act will be used as sigma^+ in vector gating if `True`)
284
+ :param update_edge_attr: whether to compute an updated edge representation
285
+ '''
286
+
287
+ def __init__(self, in_dims, out_dims, edge_dims,
288
+ n_layers=3, module_list=None, aggr="mean",
289
+ activations=(F.relu, torch.sigmoid), vector_gate=False,
290
+ update_edge_attr=False):
291
+ super(GVPConv, self).__init__(aggr=aggr)
292
+ self.si, self.vi = in_dims
293
+ self.so, self.vo = out_dims
294
+ self.se, self.ve = edge_dims
295
+ self.update_edge_attr = update_edge_attr
296
+
297
+ GVP_ = functools.partial(GVP,
298
+ activations=activations,
299
+ vector_gate=vector_gate)
300
+
301
+ module_list = module_list or []
302
+ if not module_list:
303
+ if n_layers == 1:
304
+ module_list.append(
305
+ GVP_((2 * self.si + self.se, 2 * self.vi + self.ve),
306
+ (self.so, self.vo), activations=(None, None)))
307
+ else:
308
+ module_list.append(
309
+ GVP_((2 * self.si + self.se, 2 * self.vi + self.ve),
310
+ out_dims)
311
+ )
312
+ for i in range(n_layers - 2):
313
+ module_list.append(GVP_(out_dims, out_dims))
314
+ module_list.append(GVP_(out_dims, out_dims,
315
+ activations=(None, None)))
316
+ self.message_func = nn.Sequential(*module_list)
317
+
318
+ self.edge_func = copy.deepcopy(self.message_func) \
319
+ if self.update_edge_attr else None
320
+
321
+ def forward(self, x, edge_index, edge_attr):
322
+ '''
323
+ :param x: tuple (s, V) of `torch.Tensor`
324
+ :param edge_index: array of shape [2, n_edges]
325
+ :param edge_attr: tuple (s, V) of `torch.Tensor`
326
+ '''
327
+ x_s, x_v = x
328
+ message = self.propagate(edge_index,
329
+ s=x_s,
330
+ v=x_v.reshape(x_v.shape[0], 3 * x_v.shape[1]),
331
+ edge_attr=edge_attr)
332
+
333
+ if self.update_edge_attr:
334
+ s_i, s_j = x_s[edge_index[0]], x_s[edge_index[1]]
335
+ x_v = x_v.reshape(x_v.shape[0], 3 * x_v.shape[1])
336
+ v_i, v_j = x_v[edge_index[0]], x_v[edge_index[1]]
337
+
338
+ edge_out = self.edge_attr(s_i, v_i, s_j, v_j, edge_attr)
339
+ return _split(message, self.vo), edge_out
340
+ else:
341
+ return _split(message, self.vo)
342
+
343
+ def message(self, s_i, v_i, s_j, v_j, edge_attr):
344
+ v_j = v_j.view(v_j.shape[0], v_j.shape[1] // 3, 3)
345
+ v_i = v_i.view(v_i.shape[0], v_i.shape[1] // 3, 3)
346
+ message = tuple_cat((s_j, v_j), edge_attr, (s_i, v_i))
347
+ message = self.message_func(message)
348
+ return _merge(*message)
349
+
350
+ def edge_attr(self, s_i, v_i, s_j, v_j, edge_attr):
351
+ v_j = v_j.view(v_j.shape[0], v_j.shape[1] // 3, 3)
352
+ v_i = v_i.view(v_i.shape[0], v_i.shape[1] // 3, 3)
353
+ message = tuple_cat((s_j, v_j), edge_attr, (s_i, v_i))
354
+ return self.edge_func(message)
355
+
356
+
357
+ class GVPConvLayer(nn.Module):
358
+ '''
359
+ Full graph convolution / message passing layer with
360
+ Geometric Vector Perceptrons. Residually updates node embeddings with
361
+ aggregated incoming messages, applies a pointwise feedforward
362
+ network to node embeddings, and returns updated node embeddings.
363
+
364
+ To only compute the aggregated messages, see `GVPConv`.
365
+
366
+ :param node_dims: node embedding dimensions (n_scalar, n_vector)
367
+ :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
368
+ :param n_message: number of GVPs to use in message function
369
+ :param n_feedforward: number of GVPs to use in feedforward function
370
+ :param drop_rate: drop probability in all dropout layers
371
+ :param autoregressive: if `True`, this `GVPConvLayer` will be used
372
+ with a different set of input node embeddings for messages
373
+ where src >= dst
374
+ :param activations: tuple of functions (scalar_act, vector_act) to use in GVPs
375
+ :param vector_gate: whether to use vector gating.
376
+ (vector_act will be used as sigma^+ in vector gating if `True`)
377
+ :param update_edge_attr: whether to compute an updated edge representation
378
+ :param ln_vector_weight: whether to include a learnable weight in the vector
379
+ layer norm
380
+ '''
381
+
382
+ def __init__(self, node_dims, edge_dims,
383
+ n_message=3, n_feedforward=2, drop_rate=.1,
384
+ autoregressive=False,
385
+ activations=(F.relu, torch.sigmoid), vector_gate=False,
386
+ update_edge_attr=False, ln_vector_weight=False):
387
+
388
+ super(GVPConvLayer, self).__init__()
389
+ assert not (update_edge_attr and autoregressive), "Not implemented"
390
+ self.update_edge_attr = update_edge_attr
391
+ self.conv = GVPConv(node_dims, node_dims, edge_dims, n_message,
392
+ aggr="add" if autoregressive else "mean",
393
+ activations=activations, vector_gate=vector_gate,
394
+ update_edge_attr=update_edge_attr)
395
+ GVP_ = functools.partial(GVP,
396
+ activations=activations,
397
+ vector_gate=vector_gate)
398
+ self.norm = nn.ModuleList([LayerNorm(node_dims, ln_vector_weight)
399
+ for _ in range(2)])
400
+ self.dropout = nn.ModuleList([Dropout(drop_rate) for _ in range(2)])
401
+
402
+ def get_feedforward(n_dims):
403
+ ff_func = []
404
+ if n_feedforward == 1:
405
+ ff_func.append(GVP_(n_dims, n_dims, activations=(None, None)))
406
+ else:
407
+ hid_dims = 4 * n_dims[0], 2 * n_dims[1]
408
+ ff_func.append(GVP_(n_dims, hid_dims))
409
+ for i in range(n_feedforward - 2):
410
+ ff_func.append(GVP_(hid_dims, hid_dims))
411
+ ff_func.append(GVP_(hid_dims, n_dims, activations=(None, None)))
412
+ return nn.Sequential(*ff_func)
413
+
414
+ self.ff_func = get_feedforward(node_dims)
415
+
416
+ if self.update_edge_attr:
417
+ self.edge_norm = nn.ModuleList([LayerNorm(edge_dims, ln_vector_weight)
418
+ for _ in range(2)])
419
+ self.edge_dropout = nn.ModuleList([Dropout(drop_rate) for _ in range(2)])
420
+ self.edge_ff = get_feedforward(edge_dims)
421
+
422
+ def forward(self, x, edge_index, edge_attr,
423
+ autoregressive_x=None, node_mask=None):
424
+ '''
425
+ :param x: tuple (s, V) of `torch.Tensor`
426
+ :param edge_index: array of shape [2, n_edges]
427
+ :param edge_attr: tuple (s, V) of `torch.Tensor`
428
+ :param autoregressive_x: tuple (s, V) of `torch.Tensor`.
429
+ If not `None`, will be used as src node embeddings
430
+ for forming messages where src >= dst. The current node
431
+ embeddings `x` will still be the base of the update and the
432
+ pointwise feedforward.
433
+ :param node_mask: array of type `bool` to index into the first
434
+ dim of node embeddings (s, V). If not `None`, only
435
+ these nodes will be updated.
436
+ '''
437
+
438
+ if autoregressive_x is not None:
439
+ src, dst = edge_index
440
+ mask = src < dst
441
+ edge_index_forward = edge_index[:, mask]
442
+ edge_index_backward = edge_index[:, ~mask]
443
+ edge_attr_forward = tuple_index(edge_attr, mask)
444
+ edge_attr_backward = tuple_index(edge_attr, ~mask)
445
+
446
+ dh = tuple_sum(
447
+ self.conv(x, edge_index_forward, edge_attr_forward),
448
+ self.conv(autoregressive_x, edge_index_backward,
449
+ edge_attr_backward)
450
+ )
451
+
452
+ count = scatter_add(torch.ones_like(dst), dst,
453
+ dim_size=dh[0].size(0)).clamp(min=1).unsqueeze(
454
+ -1)
455
+
456
+ dh = dh[0] / count, dh[1] / count.unsqueeze(-1)
457
+
458
+ else:
459
+ dh = self.conv(x, edge_index, edge_attr)
460
+
461
+ if self.update_edge_attr:
462
+ dh, de = dh
463
+ edge_attr = self.edge_norm[0](tuple_sum(edge_attr, self.dropout[0](de)))
464
+ de = self.edge_ff(edge_attr)
465
+ edge_attr = self.edge_norm[1](tuple_sum(edge_attr, self.dropout[1](de)))
466
+
467
+ if node_mask is not None:
468
+ x_ = x
469
+ x, dh = tuple_index(x, node_mask), tuple_index(dh, node_mask)
470
+
471
+ x = self.norm[0](tuple_sum(x, self.dropout[0](dh)))
472
+
473
+ dh = self.ff_func(x)
474
+ x = self.norm[1](tuple_sum(x, self.dropout[1](dh)))
475
+
476
+ if node_mask is not None:
477
+ x_[0][node_mask], x_[1][node_mask] = x[0], x[1]
478
+ x = x_
479
+ return (x, edge_attr) if self.update_edge_attr else x
480
+
481
+
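The autoregressive branch above aggregates with `aggr="add"` and then rescales each node's summed message by its in-degree. A minimal standalone sketch of that normalization (assuming `torch_scatter` is installed):

```python
import torch
from torch_scatter import scatter_add

# Summed messages are divided by the number of incoming edges per node,
# clamped to 1 so that isolated nodes do not divide by zero.
dst = torch.tensor([0, 0, 2])  # edge destinations in a 4-node graph
count = scatter_add(torch.ones_like(dst), dst, dim_size=4).clamp(min=1)
print(count)  # tensor([2, 1, 1, 1]): nodes 1 and 3 have no incoming edges
```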
482
+ ################################################################################
483
+ def _normalize(tensor, dim=-1, eps=1e-8):
484
+ '''
485
+ Normalizes a `torch.Tensor` along dimension `dim` without `nan`s.
486
+ '''
487
+ return torch.nan_to_num(
488
+ torch.div(tensor, torch.norm(tensor, dim=dim, keepdim=True) + eps))
489
+
490
+
491
+ def _rbf(D, D_min=0., D_max=20., D_count=16, device='cpu'):
492
+ '''
493
+ From https://github.com/jingraham/neurips19-graph-protein-design
494
+
495
+ Returns an RBF embedding of `torch.Tensor` `D` along a new axis=-1.
496
+ That is, if `D` has shape [...dims], then the returned tensor will have
497
+ shape [...dims, D_count].
498
+ '''
499
+ D_mu = torch.linspace(D_min, D_max, D_count, device=device)
500
+ D_mu = D_mu.view([1, -1])
501
+ D_sigma = (D_max - D_min) / D_count
502
+ D_expand = torch.unsqueeze(D, -1)
503
+
504
+ RBF = torch.exp(-((D_expand - D_mu) / D_sigma) ** 2)
505
+ return RBF
506
+
507
+
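As a quick sanity check of the shape contract stated in the `_rbf` docstring, a minimal sketch (assuming `_rbf` from this module is in scope):

```python
import torch

# Each distance is expanded into D_count Gaussian basis functions centred on
# an evenly spaced grid over [D_min, D_max], along a new trailing axis.
D = torch.rand(5) * 20.0
emb = _rbf(D, D_min=0., D_max=20., D_count=16)
assert emb.shape == (5, 16)  # [...dims, D_count]
```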
508
+ class GVPModel(torch.nn.Module):
509
+ """
510
+ GVP-GNN model
511
+ inspired by: https://github.com/drorlab/gvp-pytorch/blob/main/gvp/models.py
512
+ and: https://github.com/drorlab/gvp-pytorch/blob/82af6b22eaf8311c15733117b0071408d24ed877/gvp/atom3d.py#L115
513
+
514
+ :param node_in_dim: node dimension in input graph, scalars or tuple (scalars, vectors)
515
+ :param node_h_dim: node dimensions to use in GVP-GNN layers, tuple (s, V)
516
+ :param node_out_nf: number of scalar node features in the output graph; the model additionally outputs one vector channel per node
517
+ :param edge_in_nf: edge dimension in input graph (scalars)
518
+ :param edge_h_dim: edge dimensions to embed to before use in GVP-GNN layers,
519
+ tuple (s, V)
520
+ :param edge_out_nf: number of scalar edge features in the output graph
521
+ :param num_layers: number of GVP-GNN layers
522
+ :param drop_rate: rate to use in all dropout layers
523
+ :param vector_gate: use vector gates in all GVPs
524
+ :param reflection_equiv: bool, use reflection-sensitive feature based on the
525
+ cross product if False
526
+ :param d_max: maximum distance for the RBF distance expansion
527
+ :param num_rbf: number of radial basis functions used to embed pairwise distances
528
+ :param update_edge_attr: bool, update edge attributes at each layer in a
529
+ learnable way
530
+ """
531
+ def __init__(self, node_in_dim, node_h_dim, node_out_nf,
532
+ edge_in_nf, edge_h_dim, edge_out_nf,
533
+ num_layers=3, drop_rate=0.1, vector_gate=False,
534
+ reflection_equiv=True, d_max=20.0, num_rbf=16,
535
+ update_edge_attr=False):
536
+
537
+ super(GVPModel, self).__init__()
538
+
539
+ self.reflection_equiv = reflection_equiv
540
+ self.update_edge_attr = update_edge_attr
541
+ self.d_max = d_max
542
+ self.num_rbf = num_rbf
543
+
544
+ # node_in_dim = (node_in_dim, 1)
545
+ if not isinstance(node_in_dim, tuple):
546
+ node_in_dim = (node_in_dim, 0)
547
+
548
+ edge_in_dim = (edge_in_nf + 2 * node_in_dim[0] + self.num_rbf, 1)
549
+ if not self.reflection_equiv:
550
+ edge_in_dim = (edge_in_dim[0], edge_in_dim[1] + 1)
551
+
552
+ # self.W_v = nn.Sequential(
553
+ # GVP(node_in_dim, node_h_dim, activations=(None, None), vector_gate=True),
554
+ # LayerNorm(node_h_dim)
555
+ # )
556
+ self.W_v = nn.Sequential(
557
+ LayerNorm(node_in_dim, learnable_vector_weight=True),
558
+ GVP(node_in_dim, node_h_dim, activations=(None, None), vector_gate=vector_gate),
559
+ )
560
+ # self.W_e = nn.Sequential(
561
+ # GVP(edge_in_dim, edge_h_dim, activations=(None, None), vector_gate=True),
562
+ # LayerNorm(edge_h_dim)
563
+ # )
564
+ self.W_e = nn.Sequential(
565
+ LayerNorm(edge_in_dim, learnable_vector_weight=True),
566
+ GVP(edge_in_dim, edge_h_dim, activations=(None, None), vector_gate=vector_gate),
567
+ )
568
+
569
+ self.layers = nn.ModuleList(
570
+ GVPConvLayer(node_h_dim, edge_h_dim, drop_rate=drop_rate,
571
+ update_edge_attr=self.update_edge_attr,
572
+ activations=(F.relu, None), vector_gate=vector_gate,
573
+ ln_vector_weight=True)
574
+ # activations=(F.relu, torch.sigmoid))
575
+ # GVPConvLayer(node_h_dim, edge_h_dim, drop_rate=drop_rate,
576
+ # update_edge_attr=self.update_edge_attr,
577
+ # activations=(nn.SiLU(), nn.SiLU()))
578
+ for _ in range(num_layers))
579
+
580
+ # self.W_v_out = GVP(node_h_dim, (node_out_nf, 1),
581
+ # activations=(None, None), vector_gate=True)
582
+ self.W_v_out = nn.Sequential(
583
+ LayerNorm(node_h_dim, learnable_vector_weight=True),
584
+ GVP(node_h_dim, (node_out_nf, 1), activations=(None, None), vector_gate=vector_gate),
585
+ )
586
+ # self.W_e_out = GVP(edge_h_dim, (edge_out_nf, 0),
587
+ # activations=(None, None), vector_gate=True) \
588
+ # if self.update_edge_attr else None
589
+ self.W_e_out = nn.Sequential(
590
+ LayerNorm(edge_h_dim, learnable_vector_weight=True),
591
+ GVP(edge_h_dim, (edge_out_nf, 0), activations=(None, None), vector_gate=vector_gate)
592
+ ) if self.update_edge_attr else None
593
+
594
+ def edge_features(self, h, x, edge_index, batch_mask=None, edge_attr=None):
595
+ """
596
+ :param h: scalar node features, shape (n_nodes, n_node_features)
597
+ :param x: node coordinates, shape (n_nodes, 3)
598
+ :param edge_index: array of shape [2, n_edges]
599
+ :param batch_mask: graph index per node; only used if reflection_equiv is False
600
+ :param edge_attr: optional scalar edge features, shape (n_edges, n_edge_features)
601
+ :return: scalar and vector-valued edge features
602
+ """
603
+ row, col = edge_index
604
+ coord_diff = x[row] - x[col]
605
+ dist = coord_diff.norm(dim=-1)
606
+ rbf = _rbf(dist, D_max=self.d_max, D_count=self.num_rbf,
607
+ device=x.device)
608
+
609
+ edge_s = torch.cat([h[row], h[col], rbf], dim=1)
610
+ edge_v = _normalize(coord_diff).unsqueeze(-2)
611
+
612
+ if edge_attr is not None:
613
+ edge_s = torch.cat([edge_s, edge_attr], dim=1)
614
+
615
+ if not self.reflection_equiv:
616
+ mean = scatter_mean(x, batch_mask, dim=0,
617
+ dim_size=batch_mask.max() + 1)
618
+ row, col = edge_index
619
+ cross = torch.cross(x[row] - mean[batch_mask[row]],
620
+ x[col] - mean[batch_mask[col]], dim=1)
621
+ cross = _normalize(cross).unsqueeze(-2)
622
+
623
+ edge_v = torch.cat([edge_v, cross], dim=-2)
624
+
625
+ return torch.nan_to_num(edge_s), torch.nan_to_num(edge_v)
626
+
627
+ def forward(self, h, x, edge_index, v=None, batch_mask=None, edge_attr=None):
628
+
629
+ # h_v = (h, x.unsqueeze(-2))
630
+ h_v = h if v is None else (h, v)
631
+ h_e = self.edge_features(h, x, edge_index, batch_mask, edge_attr)
632
+
633
+ h_v = self.W_v(h_v)
634
+ h_e = self.W_e(h_e)
635
+
636
+ for layer in self.layers:
637
+ h_v = layer(h_v, edge_index, edge_attr=h_e)
638
+ if self.update_edge_attr:
639
+ h_v, h_e = h_v
640
+
641
+ # h, x = self.W_v_out(h_v)
642
+ # x = x.squeeze(-2)
643
+ h, vel = self.W_v_out(h_v)
644
+ # x = x + vel.squeeze(-2)
645
+
646
+ if self.update_edge_attr:
647
+ edge_attr = self.W_e_out(h_e)
648
+
649
+ # return h, x, edge_attr
650
+ return h, vel.squeeze(-2), edge_attr
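For orientation, a minimal smoke test of `GVPModel` (a sketch only: the dimensions are arbitrary, and `update_edge_attr=True` is assumed so updated scalar edge features are returned):

```python
import torch

model = GVPModel(node_in_dim=16, node_h_dim=(32, 8), node_out_nf=16,
                 edge_in_nf=4, edge_h_dim=(16, 4), edge_out_nf=4,
                 num_layers=2, update_edge_attr=True)

n, m = 30, 100
h = torch.randn(n, 16)                    # scalar node features
x = torch.randn(n, 3)                     # node coordinates
edge_index = torch.randint(0, n, (2, m))
edge_attr = torch.randn(m, 4)             # scalar edge features
batch = torch.zeros(n, dtype=torch.long)  # a single graph

h_out, vel, e_out = model(h, x, edge_index, batch_mask=batch, edge_attr=edge_attr)
assert h_out.shape == (n, 16) and vel.shape == (n, 3) and e_out.shape == (m, 4)
```

The returned `vel` is the single output vector channel per node (squeezed from `W_v_out`), which the callers treat as a velocity rather than as updated coordinates.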
src/model/gvp_transformer.py ADDED
@@ -0,0 +1,471 @@
1
+ import math
2
+ import functools
3
+ import torch
4
+ from torch import nn
5
+ import torch.nn.functional as F
6
+ from torch_scatter import scatter_mean, scatter_std, scatter_min, scatter_max, scatter_softmax
7
+
8
+
9
+ # ## debug
10
+ # import sys
11
+ # from pathlib import Path
12
+ #
13
+ # basedir = Path(__file__).resolve().parent.parent.parent
14
+ # sys.path.append(str(basedir))
15
+ # ###
16
+
17
+ from src.model.gvp import GVP, _norm_no_nan, tuple_sum, Dropout, LayerNorm, \
18
+ tuple_cat, tuple_index, _rbf, _normalize
19
+
20
+
21
+ def tuple_mul(tup, val):
22
+ if isinstance(val, torch.Tensor):
23
+ return (tup[0] * val, tup[1] * val.unsqueeze(-1))
24
+ return (tup[0] * val, tup[1] * val)
25
+
26
+
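`tuple_mul` is used in two ways below: with a tensor it broadcasts a weight over both channels of an (s, V) tuple, adding a trailing axis so the 3D vector dimension broadcasts; with a plain int it scales a dimension tuple for the multi-head projections. A minimal sketch:

```python
import torch

s = torch.randn(100, 9, 7)     # (edges, heads, scalar features)
v = torch.randn(100, 9, 4, 3)  # (edges, heads, vector features, 3)
w = torch.rand(100, 9, 1)      # e.g. per-head attention weights
ws, wv = tuple_mul((s, v), w)
assert ws.shape == s.shape and wv.shape == v.shape

assert tuple_mul((6, 3), 4) == (24, 12)  # widening (s, V) dims for 4 heads
```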
27
+ class GVPBlock(nn.Module):
28
+ def __init__(self, in_dims, out_dims, n_layers=1,
29
+ activations=(F.relu, torch.sigmoid), vector_gate=False,
30
+ dropout=0.0, skip=False, layernorm=False):
31
+ super(GVPBlock, self).__init__()
32
+ self.si, self.vi = in_dims
33
+ self.so, self.vo = out_dims
34
+ assert not skip or (self.si == self.so and self.vi == self.vo)
35
+ self.skip = skip
36
+
37
+ GVP_ = functools.partial(GVP, activations=activations, vector_gate=vector_gate)
38
+
39
+ module_list = []
40
+ if n_layers == 1:
41
+ module_list.append(GVP_(in_dims, out_dims, activations=(None, None)))
42
+ else:
43
+ module_list.append(GVP_(in_dims, out_dims))
44
+ for i in range(n_layers - 2):
45
+ module_list.append(GVP_(out_dims, out_dims))
46
+ module_list.append(GVP_(out_dims, out_dims, activations=(None, None)))
47
+
48
+ self.layers = nn.Sequential(*module_list)
49
+
50
+ self.norm = LayerNorm(out_dims, learnable_vector_weight=True) if layernorm else None
51
+ self.dropout = Dropout(dropout) if dropout > 0 else None
52
+
53
+ def forward(self, x):
54
+ """
55
+ :param x: tuple (s, V) of `torch.Tensor`
56
+ :return: tuple (s, V) of `torch.Tensor`
57
+ """
58
+
59
+ dx = self.layers(x)
60
+
61
+ if self.dropout is not None:
62
+ dx = self.dropout(dx)
63
+
64
+ if self.skip:
65
+ x = tuple_sum(x, dx)
66
+ else:
67
+ x = dx
68
+
69
+ if self.norm is not None:
70
+ x = self.norm(x)
71
+
72
+ return x
73
+
74
+
75
+ class GeometricPNA(nn.Module):
76
+ def __init__(self, d_in, d_out):
77
+ """ Map features to global features """
78
+ super().__init__()
79
+ si, vi = d_in
80
+ so, vo = d_out
81
+ self.gvp = GVPBlock((4 * si + 3 * vi, vi), d_out)
82
+
83
+ def forward(self, x, batch_mask, batch_size=None):
84
+ """ x: tuple (s, V) """
85
+ s, v = x
86
+
87
+ sm = scatter_mean(s, batch_mask, dim=0, dim_size=batch_size)
88
+ smi = scatter_min(s, batch_mask, dim=0, dim_size=batch_size)[0]
89
+ sma = scatter_max(s, batch_mask, dim=0, dim_size=batch_size)[0]
90
+ sstd = scatter_std(s, batch_mask, dim=0, dim_size=batch_size)
91
+
92
+ vnorm = _norm_no_nan(v)
93
+ vm = scatter_mean(v, batch_mask, dim=0, dim_size=batch_size)
94
+ vmi = scatter_min(vnorm, batch_mask, dim=0, dim_size=batch_size)[0]
95
+ vma = scatter_max(vnorm, batch_mask, dim=0, dim_size=batch_size)[0]
96
+ vstd = scatter_std(vnorm, batch_mask, dim=0, dim_size=batch_size)
97
+
98
+ z = torch.hstack((sm, smi, sma, sstd, vmi, vma, vstd))
99
+ out = self.gvp((z, vm))
100
+ return out
101
+
102
+
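A shape sketch for `GeometricPNA` (dimensions are arbitrary): per-graph mean/min/max/std statistics of the scalar features and of the vector norms are concatenated, the mean vector is kept as the vector channel, and one `GVPBlock` maps the result to the global feature size.

```python
import torch

pna = GeometricPNA(d_in=(16, 8), d_out=(4, 2))
s = torch.randn(30, 16)
v = torch.randn(30, 8, 3)
batch = torch.arange(30) % 3  # three graphs with ten nodes each
gs, gv = pna((s, v), batch, batch_size=3)
assert gs.shape == (3, 4) and gv.shape == (3, 2, 3)
```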
103
+ class TupleLinear(nn.Module):
104
+ def __init__(self, in_dims, out_dims, bias=True):
105
+ super().__init__()
106
+ self.si, self.vi = in_dims
107
+ self.so, self.vo = out_dims
108
+ assert self.si and self.so
109
+ self.ws = nn.Linear(self.si, self.so, bias=bias)
110
+ self.wv = nn.Linear(self.vi, self.vo, bias=bias) if self.vi and self.vo else None
111
+
112
+ def forward(self, x):
113
+ if self.vi:
114
+ s, v = x
115
+
116
+ s = self.ws(s)
117
+
118
+ if self.vo:
119
+ v = v.transpose(-1, -2)
120
+ v = self.wv(v)
121
+ v = v.transpose(-1, -2)
122
+
123
+ else:
124
+ s = self.ws(x)
125
+
126
+ if self.vo:
127
+ v = torch.zeros(s.size(0), self.vo, 3, device=s.device)
128
+
129
+ return (s, v) if self.vo else s
130
+
131
+
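`TupleLinear` applies independent linear maps to the scalar channel and, via a transpose, to the vector-channel axis; the 3D axis itself is never mixed, which preserves equivariance (the attention projections below accordingly instantiate it with `bias=False`). A minimal sketch:

```python
import torch

lin = TupleLinear((16, 8), (32, 4), bias=False)
s, v = lin((torch.randn(10, 16), torch.randn(10, 8, 3)))
assert s.shape == (10, 32) and v.shape == (10, 4, 3)
```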
132
+ class GVPTransformerLayer(nn.Module):
133
+ """
134
+ Full graph transformer layer with Geometric Vector Perceptrons.
135
+ Inspired by
136
+ - GVP: Jing, Bowen, et al. "Learning from protein structure with geometric vector perceptrons." arXiv preprint arXiv:2009.01411 (2020).
137
+ - Transformer architecture: Vignac, Clement, et al. "Digress: Discrete denoising diffusion for graph generation." arXiv preprint arXiv:2209.14734 (2022).
138
+ - Invariant point attention: Jumper, John, et al. "Highly accurate protein structure prediction with AlphaFold." Nature 596.7873 (2021): 583-589.
139
+
140
+ :param node_dims: node embedding dimensions (n_scalar, n_vector)
141
+ :param edge_dims: input edge embedding dimensions (n_scalar, n_vector)
142
+ :param global_dims: global feature dimension (n_scalar, n_vector)
143
+ :param dk: key dimension, (n_scalar, n_vector)
144
+ :param dv: node value dimension, (n_scalar, n_vector)
145
+ :param de: edge value dimension, (n_scalar, n_vector)
146
+ :param db: dimension of edge contribution to attention, int
147
+ :param attn_heads: number of attention heads, int
148
+ :param n_feedforward: number of GVPs to use in feedforward function
149
+ :param drop_rate: drop probability in all dropout layers
150
+ :param activations: tuple of functions (scalar_act, vector_act) to use in GVPs
151
+ :param vector_gate: whether to use vector gating.
152
+ (vector_act will be used as sigma^+ in vector gating if `True`)
153
+ :param attention: can be used to turn off the attention mechanism
154
+ """
155
+
156
+ def __init__(self, node_dims, edge_dims, global_dims, dk, dv, de, db,
157
+ attn_heads, n_feedforward=1, drop_rate=0.0,
158
+ activations=(F.relu, torch.sigmoid), vector_gate=False,
159
+ attention=True):
160
+
161
+ super(GVPTransformerLayer, self).__init__()
162
+
163
+ self.attention = attention
164
+
165
+ dq = dk
166
+ self.dq = dq
167
+ self.dk = dk
168
+ self.dv = dv
169
+ self.de = de
170
+ self.db = db
171
+
172
+ self.h = attn_heads
173
+
174
+ self.q = TupleLinear(node_dims, tuple_mul(dq, self.h), bias=False) if self.attention else None
175
+ self.k = TupleLinear(node_dims, tuple_mul(dk, self.h), bias=False) if self.attention else None
176
+ self.vx = TupleLinear(node_dims, tuple_mul(dv, self.h), bias=False)
177
+
178
+ self.ve = TupleLinear(edge_dims, tuple_mul(de, self.h), bias=False)
179
+ self.b = TupleLinear(edge_dims, (db * self.h, 0), bias=False) if self.attention else None
180
+
181
+ m_dim = tuple_sum(tuple_mul(dv, self.h), tuple_mul(de, self.h))
182
+ self.msg = GVPBlock(m_dim, m_dim, n_feedforward,
183
+ activations=activations, vector_gate=vector_gate)
184
+
185
+ m_dim = tuple_sum(m_dim, global_dims)
186
+ self.x_out = GVPBlock(m_dim, node_dims, n_feedforward,
187
+ activations=activations, vector_gate=vector_gate)
188
+ self.x_norm = LayerNorm(node_dims, learnable_vector_weight=True)
189
+ self.x_dropout = Dropout(drop_rate)
190
+
191
+ e_dim = tuple_sum(tuple_mul(node_dims, 2), edge_dims, global_dims)
192
+ if self.attention:
193
+ e_dim = (e_dim[0] + 3 * attn_heads, e_dim[1])
194
+ self.e_out = GVPBlock(e_dim, edge_dims, n_feedforward,
195
+ activations=activations, vector_gate=vector_gate)
196
+ self.e_norm = LayerNorm(edge_dims, learnable_vector_weight=True)
197
+ self.e_dropout = Dropout(drop_rate)
198
+
199
+ self.pna_x = GeometricPNA(node_dims, node_dims)
200
+ self.pna_e = GeometricPNA(edge_dims, edge_dims)
201
+ self.y = GVP(global_dims, global_dims, activations=(None, None), vector_gate=vector_gate)
202
+ _dim = tuple_sum(node_dims, edge_dims, global_dims)
203
+ self.y_out = GVPBlock(_dim, global_dims, n_feedforward,
204
+ activations=activations, vector_gate=vector_gate)
205
+ self.y_norm = LayerNorm(global_dims, learnable_vector_weight=True)
206
+ self.y_dropout = Dropout(drop_rate)
207
+
208
+ def forward(self, x, edge_index, batch_mask, edge_attr, global_attr=None,
209
+ node_mask=None):
210
+ """
211
+ :param x: tuple (s, V) of `torch.Tensor`
212
+ :param edge_index: array of shape [2, n_edges]
213
+ :param batch_mask: array indicating different graphs
214
+ :param edge_attr: tuple (s, V) of `torch.Tensor`
215
+ :param global_attr: tuple (s, V) of `torch.Tensor`
216
+ :param node_mask: array of type `bool` to index into the first
217
+ dim of node embeddings (s, V). If not `None`, only
218
+ these nodes will be updated.
219
+ """
220
+
221
+ row, col = edge_index
222
+ n = len(x[0])
223
+ batch_size = len(torch.unique(batch_mask))
224
+
225
+ # Compute attention
226
+ if self.attention:
227
+ Q = self.q(x)
228
+ K = self.k(x)
229
+ b = self.b(edge_attr)
230
+
231
+ qs, qv = Q # (n, dq * h), (n, dq * h, 3)
232
+ ks, kv = K # (n, dq * h), (n, dq * h, 3)
233
+ attn_s = (qs[row] * ks[col]).reshape(len(row), self.h, self.dq[0]).sum(dim=-1) # (m, h)
234
+ # NOTE: attn_v is the Frobenius inner product between vector-valued queries and keys of size [dq, 3]
235
+ # (generalizes the dot-product between queries and keys similar to Pocket2Mol)
236
+ # TODO: double-check if this is correctly implemented!
237
+ attn_v = (qv[row] * kv[col]).reshape(len(row), self.h, self.dq[1], 3).sum(dim=(-2, -1)) # (m, h)
238
+ attn_e = b.reshape(b.size(0), self.h, self.db).sum(dim=-1) # (m, h)
239
+
240
+ attn = attn_s / math.sqrt(3 * self.dk[0]) + \
241
+ attn_v / math.sqrt(9 * self.dk[1]) + \
242
+ attn_e / math.sqrt(3 * self.db)
243
+ attn = scatter_softmax(attn, row, dim=0) # (m, h)
244
+ attn = attn.unsqueeze(-1) # (m, h, 1)
245
+
246
+ # Compute new features
247
+ Vx = self.vx(x)
248
+ Ve = self.ve(edge_attr)
249
+
250
+ mx = (Vx[0].reshape(Vx[0].size(0), self.h, self.dv[0]), # (n, h, dv)
251
+ Vx[1].reshape(Vx[1].size(0), self.h, self.dv[1], 3)) # (n, h, dv, 3)
252
+ me = (Ve[0].reshape(Ve[0].size(0), self.h, self.de[0]),
253
+ Ve[1].reshape(Ve[1].size(0), self.h, self.de[1], 3))
254
+
255
+ mx = tuple_index(mx, col)
256
+ if self.attention:
257
+ mx = tuple_mul(mx, attn)
258
+ me = tuple_mul(me, attn)
259
+
260
+ _m = tuple_cat(mx, me)
261
+ _m = (_m[0].flatten(1), _m[1].flatten(1, 2))
262
+ m = self.msg(_m) # (m, h * dv), (m, h * dv, 3)
263
+ m = (scatter_mean(m[0], row, dim=0, dim_size=n), # (n, h * dv)
264
+ scatter_mean(m[1], row, dim=0, dim_size=n)) # (n, h * dv, 3)
265
+ if global_attr is not None:
266
+ m = tuple_cat(m, tuple_index(global_attr, batch_mask))
267
+ X_out = self.x_norm(tuple_sum(x, self.x_dropout(self.x_out(m))))
268
+
269
+ _e = tuple_cat(tuple_index(x, row), tuple_index(x, col), edge_attr)
270
+ if self.attention:
271
+ _e = (torch.cat([_e[0], attn_s, attn_v, attn_e], dim=-1), _e[1])
272
+ if global_attr is not None:
273
+ _e = tuple_cat(_e, tuple_index(global_attr, batch_mask[row]))
274
+ E_out = self.e_norm(tuple_sum(edge_attr, self.e_dropout(self.e_out(_e))))
275
+
276
+ _y = tuple_cat(self.pna_x(x, batch_mask, batch_size),
277
+ self.pna_e(edge_attr, batch_mask[row], batch_size))
278
+ if global_attr is not None:
279
+ _y = tuple_cat(_y, self.y(global_attr))
280
+ y_out = self.y_norm(tuple_sum(global_attr, self.y_dropout(self.y_out(_y))))
281
+ else:
282
+ y_out = self.y_norm(self.y_dropout(self.y_out(_y)))
283
+
284
+ if node_mask is not None:
285
+ X_out[0][~node_mask], X_out[1][~node_mask] = tuple_index(x, ~node_mask)
286
+
287
+ return X_out, E_out, y_out
288
+
289
+
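The attention above is normalised with a scatter softmax over the edges that share the same `row` node, the same index later used to aggregate messages. A minimal standalone sketch of that primitive (assuming `torch_scatter` is installed):

```python
import torch
from torch_scatter import scatter_softmax

logits = torch.randn(100, 9)                # (n_edges, heads), cf. attn_s + attn_v + attn_e
row = torch.randint(0, 30, (100,))          # aggregation node index per edge
attn = scatter_softmax(logits, row, dim=0)  # per node, weights over its edges sum to 1
```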
290
+ class GVPTransformerModel(torch.nn.Module):
291
+ """
292
+ GVP-Transformer model
293
+
294
+ :param node_in_dim: node dimension in input graph, scalars or tuple (scalars, vectors)
295
+ :param node_h_dim: node dimensions to use in GVP-GNN layers, tuple (s, V)
296
+ :param node_out_nf: number of scalar node features in the output graph; the model additionally outputs one vector channel per node
297
+ :param edge_in_nf: edge dimension in input graph (scalars)
298
+ :param edge_h_dim: edge dimensions to embed to before use in GVP-GNN layers,
299
+ tuple (s, V)
300
+ :param edge_out_nf: number of scalar edge features in the output graph
301
+ :param num_layers: number of GVP transformer layers
302
+ :param drop_rate: rate to use in all dropout layers
303
+ :param reflection_equiv: bool, use reflection-sensitive feature based on the
304
+ cross product if False
305
+ :param d_max: maximum distance for the RBF distance expansion
306
+ :param num_rbf: number of radial basis functions used to embed pairwise distances
307
+ :param vector_gate: use vector gates in all GVPs
308
+ :param attention: can be used to turn off the attention mechanism
309
+ """
310
+ def __init__(self, node_in_dim, node_h_dim, node_out_nf, edge_in_nf,
311
+ edge_h_dim, edge_out_nf, num_layers, dk, dv, de, db, dy,
312
+ attn_heads, n_feedforward, drop_rate, reflection_equiv=True,
313
+ d_max=20.0, num_rbf=16, vector_gate=False, attention=True):
314
+
315
+ super(GVPTransformerModel, self).__init__()
316
+
317
+ self.reflection_equiv = reflection_equiv
318
+ self.d_max = d_max
319
+ self.num_rbf = num_rbf
320
+
321
+ # node_in_dim = (node_in_dim, 1)
322
+ if not isinstance(node_in_dim, tuple):
323
+ node_in_dim = (node_in_dim, 0)
324
+
325
+ edge_in_dim = (edge_in_nf + 2 * node_in_dim[0] + self.num_rbf, 1)
326
+ if not self.reflection_equiv:
327
+ edge_in_dim = (edge_in_dim[0], edge_in_dim[1] + 1)
328
+
329
+ self.W_v = GVP(node_in_dim, node_h_dim, activations=(None, None), vector_gate=vector_gate)
330
+ self.W_e = GVP(edge_in_dim, edge_h_dim, activations=(None, None), vector_gate=vector_gate)
331
+ # self.W_v = nn.Sequential(
332
+ # LayerNorm(node_in_dim, learnable_vector_weight=True),
333
+ # GVP(node_in_dim, node_h_dim, activations=(None, None)),
334
+ # )
335
+ # self.W_e = nn.Sequential(
336
+ # LayerNorm(edge_in_dim, learnable_vector_weight=True),
337
+ # GVP(edge_in_dim, edge_h_dim, activations=(None, None)),
338
+ # )
339
+
340
+ self.dy = dy
341
+ self.layers = nn.ModuleList(
342
+ GVPTransformerLayer(node_h_dim, edge_h_dim, dy, dk, dv, de, db,
343
+ attn_heads, n_feedforward=n_feedforward,
344
+ drop_rate=drop_rate, vector_gate=vector_gate,
345
+ activations=(F.relu, None), attention=attention)
346
+ for _ in range(num_layers))
347
+
348
+ self.W_v_out = GVP(node_h_dim, (node_out_nf, 1), activations=(None, None), vector_gate=vector_gate)
349
+ self.W_e_out = GVP(edge_h_dim, (edge_out_nf, 0), activations=(None, None), vector_gate=vector_gate)
350
+ # self.W_v_out = nn.Sequential(
351
+ # LayerNorm(node_h_dim, learnable_vector_weight=True),
352
+ # GVP(node_h_dim, (node_out_nf, 1), activations=(None, None)),
353
+ # )
354
+ # self.W_e_out = nn.Sequential(
355
+ # LayerNorm(edge_h_dim, learnable_vector_weight=True),
356
+ # GVP(edge_h_dim, (edge_out_nf, 0), activations=(None, None))
357
+ # )
358
+
359
+ def edge_features(self, h, x, edge_index, batch_mask=None, edge_attr=None):
360
+ """
361
+ :param h: scalar node features, shape (n_nodes, n_node_features)
362
+ :param x: node coordinates, shape (n_nodes, 3)
363
+ :param edge_index: array of shape [2, n_edges]
364
+ :param batch_mask: graph index per node; only used if reflection_equiv is False
365
+ :param edge_attr: optional scalar edge features, shape (n_edges, n_edge_features)
366
+ :return: scalar and vector-valued edge features
367
+ """
368
+ row, col = edge_index
369
+ coord_diff = x[row] - x[col]
370
+ dist = coord_diff.norm(dim=-1)
371
+ rbf = _rbf(dist, D_max=self.d_max, D_count=self.num_rbf,
372
+ device=x.device)
373
+
374
+ edge_s = torch.cat([h[row], h[col], rbf], dim=1)
375
+ edge_v = _normalize(coord_diff).unsqueeze(-2)
376
+
377
+ if edge_attr is not None:
378
+ edge_s = torch.cat([edge_s, edge_attr], dim=1)
379
+
380
+ if not self.reflection_equiv:
381
+ mean = scatter_mean(x, batch_mask, dim=0,
382
+ dim_size=batch_mask.max() + 1)
383
+ row, col = edge_index
384
+ cross = torch.cross(x[row] - mean[batch_mask[row]],
385
+ x[col] - mean[batch_mask[col]], dim=1)
386
+ cross = _normalize(cross).unsqueeze(-2)
387
+
388
+ edge_v = torch.cat([edge_v, cross], dim=-2)
389
+
390
+ return torch.nan_to_num(edge_s), torch.nan_to_num(edge_v)
391
+
392
+ def forward(self, h, x, edge_index, v=None, batch_mask=None, edge_attr=None):
393
+
394
+ bs = len(batch_mask.unique())
395
+
396
+ # h_v = (h, x.unsqueeze(-2))
397
+ h_v = h if v is None else (h, v)
398
+ h_e = self.edge_features(h, x, edge_index, batch_mask, edge_attr)
399
+
400
+ h_v = self.W_v(h_v)
401
+ h_e = self.W_e(h_e)
402
+ h_y = (torch.zeros(bs, self.dy[0], device=h.device),
403
+ torch.zeros(bs, self.dy[1], 3, device=h.device))
404
+
405
+ for layer in self.layers:
406
+ h_v, h_e, h_y = layer(h_v, edge_index, batch_mask, h_e, h_y)
407
+
408
+ # h, x = self.W_v_out(h_v)
409
+ # x = x.squeeze(-2)
410
+ h, vel = self.W_v_out(h_v)
411
+ # x = x + vel.squeeze(-2)
412
+
413
+ edge_attr = self.W_e_out(h_e)
414
+
415
+ # return h, x, edge_attr
416
+ return h, vel.squeeze(-2), edge_attr
417
+
418
+
419
+ if __name__ == "__main__":
420
+ from src.model.gvp import randn
421
+ from scipy.spatial.transform import Rotation
422
+
423
+ def test_equivariance(model, nodes, edges, glob_feat):
424
+ random = torch.as_tensor(Rotation.random().as_matrix(),
425
+ dtype=torch.float32, device=device)
426
+
427
+ with torch.no_grad():
428
+ X_out, E_out, y_out = model(nodes, edges, glob_feat)
429
+ n_v_rot, e_v_rot, y_v_rot = nodes[1] @ random, edges[1] @ random, glob_feat[1] @ random
430
+ X_out_v_rot = X_out[1] @ random
431
+ E_out_v_rot = E_out[1] @ random
432
+ y_out_v_rot = y_out[1] @ random
433
+ X_out_prime, E_out_prime, y_out_prime = model((nodes[0], n_v_rot), (edges[0], e_v_rot), (glob_feat[0], y_v_rot))
434
+
435
+ assert torch.allclose(X_out[0], X_out_prime[0], atol=1e-5, rtol=1e-4)
436
+ assert torch.allclose(X_out_v_rot, X_out_prime[1], atol=1e-5, rtol=1e-4)
437
+ assert torch.allclose(E_out[0], E_out_prime[0], atol=1e-5, rtol=1e-4)
438
+ assert torch.allclose(E_out_v_rot, E_out_prime[1], atol=1e-5, rtol=1e-4)
439
+ assert torch.allclose(y_out[0], y_out_prime[0], atol=1e-5, rtol=1e-4)
440
+ assert torch.allclose(y_out_v_rot, y_out_prime[1], atol=1e-5, rtol=1e-4)
441
+ print("SUCCESS")
442
+
443
+
444
+ n_nodes = 300
445
+ n_edges = 10000
446
+ batch_size = 6
447
+
448
+ node_dim = (16, 8)
449
+ edge_dim = (8, 4)
450
+ global_dim = (4, 2)
451
+ dk = (6, 3)
452
+ dv = (7, 4)
453
+ de = (5, 2)
454
+ db = 10
455
+ attn_heads = 9
456
+
457
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
458
+
459
+
460
+ nodes = randn(n_nodes, node_dim, device=device)
461
+ edges = randn(n_edges, edge_dim, device=device)
462
+ glob_feat = randn(batch_size, global_dim, device=device)
463
+ edge_index = torch.randint(0, n_nodes, (2, n_edges), device=device)
464
+ batch_idx = torch.randint(0, batch_size, (n_nodes,), device=device)
465
+
466
+ model = GVPTransformerLayer(node_dim, edge_dim, global_dim, dk, dv, de, db,
467
+ attn_heads, n_feedforward=2,
468
+ drop_rate=0.1).to(device).eval()
469
+
470
+ model_fn = lambda h_V, h_E, h_y: model(h_V, edge_index, batch_idx, h_E, h_y)
471
+ test_equivariance(model_fn, nodes, edges, glob_feat)
src/model/lightning.py ADDED
@@ -0,0 +1,1426 @@
1
+ import warnings
2
+ import tempfile
3
+ from typing import Optional, Union
4
+ from time import time
5
+ from pathlib import Path
6
+ from functools import partial
7
+ from itertools import accumulate
8
+ from argparse import Namespace
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from rdkit import Chem
13
+ import torch
14
+ from torch.utils.data import DataLoader, SubsetRandomSampler
15
+ from torch.distributions.categorical import Categorical
16
+ import pytorch_lightning as pl
17
+ from torch_scatter import scatter_mean
18
+
19
+ import src.utils as utils
20
+ from src.constants import atom_encoder, atom_decoder, aa_encoder, aa_decoder, \
21
+ bond_encoder, bond_decoder, residue_encoder, residue_bond_encoder, \
22
+ residue_decoder, residue_bond_decoder, aa_atom_index, aa_atom_mask
23
+ from src.data.dataset import ProcessedLigandPocketDataset, ClusteredDataset, get_wds
24
+ from src.data import data_utils
25
+ from src.data.data_utils import AppendVirtualNodesInCoM, center_data, Residues, TensorDict, randomize_tensors
26
+ from src.model.flows import CoordICFM, TorusICFM, CoordICFMPredictFinal, TorusICFMPredictFinal, SO3ICFM
27
+ from src.model.markov_bridge import UniformPriorMarkovBridge, MarginalPriorMarkovBridge
28
+ from src.model.dynamics import Dynamics
29
+ from src.model.dynamics_hetero import DynamicsHetero
30
+ from src.model.diffusion_utils import DistributionNodes
31
+ from src.model.loss_utils import TimestepWeights, clash_loss
32
+ from src.analysis.visualization_utils import pocket_to_rdkit, mols_to_pdbfile
33
+ from src.analysis.metrics import MoleculeValidity, CategoricalDistribution, MolecularProperties
34
+ from src.data.molecule_builder import build_molecule
35
+ from src.data.postprocessing import process_all
36
+ from src.sbdd_metrics.metrics import FullEvaluator
37
+ from src.sbdd_metrics.evaluation import VALIDITY_METRIC_NAME, aggregated_metrics, collection_metrics
38
+ from tqdm import tqdm
39
+
40
+ # derive additional constants
41
+ aa_atom_mask_tensor = torch.tensor([aa_atom_mask[aa] for aa in aa_decoder])
42
+ aa_atom_decoder = {aa: {v: k for k, v in aa_atom_index[aa].items()} for aa in aa_decoder}
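+ # per-residue heavy-atom types padded to 14 slots; slots without an atom fall
+ # back to -42 (an assumed sentinel that can never match a real atom type)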
43
+ aa_atom_type_tensor = torch.tensor([[atom_encoder.get(aa_atom_decoder[aa].get(i, '-')[0], -42)
44
+ for i in range(14)] for aa in aa_decoder])
45
+
46
+
47
+ def set_default(namespace, key, default_val):
48
+ val = vars(namespace).get(key, default_val)
49
+ setattr(namespace, key, val)
50
+
51
+
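`set_default` backfills missing hyperparameters on an argparse-style `Namespace`, which keeps older configs and checkpoints loadable when new options are introduced. A minimal sketch:

```python
from argparse import Namespace

params = Namespace(lr=1e-3)
set_default(params, "lr_gamma", None)  # added because it was absent
set_default(params, "lr", 42.0)        # existing value is kept
assert params.lr == 1e-3 and params.lr_gamma is None
```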
52
+ class DrugFlow(pl.LightningModule):
53
+ def __init__(
54
+ self,
55
+ pocket_representation: str,
56
+ train_params: Namespace,
57
+ loss_params: Namespace,
58
+ eval_params: Namespace,
59
+ predictor_params: Namespace,
60
+ simulation_params: Namespace,
61
+ virtual_nodes: Union[list, None],
62
+ flexible: bool,
63
+ flexible_bb: bool = False,
64
+ debug: bool = False,
65
+ overfit: bool = False,
66
+ ):
67
+ super(DrugFlow, self).__init__()
68
+ self.save_hyperparameters()
69
+
70
+ # Set default parameters
71
+ set_default(train_params, "sharded_dataset", False)
72
+ set_default(train_params, "sample_from_clusters", False)
73
+ set_default(train_params, "lr_step_size", None)
74
+ set_default(train_params, "lr_gamma", None)
75
+ set_default(train_params, "gnina", None)
76
+ set_default(loss_params, "lambda_x", 1.0)
77
+ set_default(loss_params, "lambda_clash", None)
78
+ set_default(loss_params, "reduce", "mean")
79
+ set_default(loss_params, "regularize_uncertainty", None)
80
+ set_default(eval_params, "n_loss_per_sample", 1)
81
+ set_default(eval_params, "n_sampling_steps", simulation_params.n_steps)
82
+ set_default(predictor_params, "transform_sc_pred", False)
83
+ set_default(predictor_params, "add_chi_as_feature", False)
84
+ set_default(predictor_params, "augment_residue_sc", False)
85
+ set_default(predictor_params, "augment_ligand_sc", False)
86
+ set_default(predictor_params, "add_all_atom_diff", False)
87
+ set_default(predictor_params, "angle_act_fn", None)
88
+ set_default(simulation_params, "predict_confidence", False)
89
+ set_default(simulation_params, "predict_final", False)
90
+ set_default(simulation_params, "scheduler_chi", None)
91
+
92
+ # Check for invalid configurations
93
+ assert pocket_representation in {'side_chain_bead', 'CA+'}
94
+ self.pocket_representation = pocket_representation
95
+
96
+ assert flexible or not predictor_params.augment_residue_sc
97
+ self.augment_residue_sc = predictor_params.augment_residue_sc \
98
+ if 'augment_residue_sc' in predictor_params else False
99
+ self.augment_ligand_sc = predictor_params.augment_ligand_sc \
100
+ if 'augment_ligand_sc' in predictor_params else False
101
+
102
+ assert not (flexible_bb and predictor_params.normal_modes), \
103
+ "Normal mode eigenvectors are only meaningful for fixed backbones"
104
+ assert (not flexible_bb) or flexible, \
105
+ "Currently atom vectors aren't updated if flexible=False"
106
+
107
+ assert not (simulation_params.predict_confidence and
108
+ (not predictor_params.heterogeneous_graph or simulation_params.predict_final))
109
+
110
+ # Set parameters
111
+ self.train_dataset = None
112
+ self.val_dataset = None
113
+ self.test_dataset = None
114
+ self.virtual_nodes = virtual_nodes
115
+ self.flexible = flexible
116
+ self.flexible_bb = flexible_bb
117
+ self.debug = debug
118
+ self.overfit = overfit
119
+ self.predict_confidence = simulation_params.predict_confidence
120
+
121
+ if self.virtual_nodes:
122
+ self.add_virtual_min = virtual_nodes[0]
123
+ self.add_virtual_max = virtual_nodes[1]
124
+
125
+ # Training parameters
126
+ self.datadir = train_params.datadir
127
+ self.receptor_dir = train_params.datadir
128
+ self.batch_size = train_params.batch_size
129
+ self.lr = train_params.lr
130
+ self.lr_step_size = train_params.lr_step_size
131
+ self.lr_gamma = train_params.lr_gamma
132
+ self.num_workers = train_params.num_workers
133
+ self.sample_from_clusters = train_params.sample_from_clusters
134
+ self.sharded_dataset = train_params.sharded_dataset
135
+ self.clip_grad = train_params.clip_grad
136
+ if self.clip_grad:
137
+ self.gradnorm_queue = utils.Queue()
138
+ # Add large value that will be flushed.
139
+ self.gradnorm_queue.add(3000)
140
+
141
+ # Evaluation parameters
142
+ self.outdir = eval_params.outdir
143
+ self.eval_batch_size = eval_params.eval_batch_size
144
+ self.eval_epochs = eval_params.eval_epochs
145
+ # assert eval_params.visualize_sample_epoch % self.eval_epochs == 0
146
+ self.visualize_sample_epoch = eval_params.visualize_sample_epoch
147
+ self.visualize_chain_epoch = eval_params.visualize_chain_epoch
148
+ self.sample_with_ground_truth_size = eval_params.sample_with_ground_truth_size
149
+ self.n_loss_per_sample = eval_params.n_loss_per_sample
150
+ self.n_eval_samples = eval_params.n_eval_samples
151
+ self.n_visualize_samples = eval_params.n_visualize_samples
152
+ self.keep_frames = eval_params.keep_frames
153
+ self.gnina = train_params.gnina
154
+
155
+ # Feature encoders/decoders
156
+ self.atom_encoder = atom_encoder
157
+ self.atom_decoder = atom_decoder
158
+ self.bond_encoder = bond_encoder
159
+ self.bond_decoder = bond_decoder
160
+ self.aa_encoder = aa_encoder
161
+ self.aa_decoder = aa_decoder
162
+ self.residue_encoder = residue_encoder
163
+ self.residue_decoder = residue_decoder
164
+ self.residue_bond_encoder = residue_bond_encoder
165
+ self.residue_bond_decoder = residue_bond_decoder
166
+
167
+ self.atom_nf = len(self.atom_decoder)
168
+ self.residue_nf = len(self.aa_decoder)
169
+ if self.pocket_representation == 'side_chain_bead':
170
+ self.residue_nf += len(self.residue_encoder)
171
+ if self.pocket_representation == 'CA+':
172
+ self.aa_atom_index = aa_atom_index
173
+ self.n_atom_aa = max([x for aa in aa_atom_index.values() for x in aa.values()]) + 1
174
+ self.residue_nf = (self.residue_nf, self.n_atom_aa) # (s, V)
175
+ self.bond_nf = len(self.bond_decoder)
176
+ self.pocket_bond_nf = len(self.residue_bond_decoder)
177
+ self.x_dim = 3
178
+
179
+ # Set up the neural network
180
+ self.dynamics = self.init_model(predictor_params)
181
+
182
+ # Initialize objects for each variable type
183
+ if simulation_params.predict_final:
184
+ self.module_x = CoordICFMPredictFinal(None)
185
+ self.module_chi = TorusICFMPredictFinal(None, 5) if self.flexible else None
186
+ if self.flexible_bb:
187
+ raise NotImplementedError()
188
+ else:
189
+ self.module_x = CoordICFM(None)
190
+ # self.module_chi = AngleICFM(None, 5) if self.flexible else None
191
+ scheduler_args = None if simulation_params.scheduler_chi is None else vars(simulation_params.scheduler_chi)
192
+ self.module_chi = TorusICFM(None, 5, scheduler_args) if self.flexible else None
193
+ self.module_trans = CoordICFM(None) if self.flexible_bb else None
194
+ self.module_rot = SO3ICFM(None) if self.flexible_bb else None
195
+
196
+ if simulation_params.prior_h == 'uniform':
197
+ self.module_h = UniformPriorMarkovBridge(self.atom_nf, loss_type=loss_params.discrete_loss)
198
+ elif simulation_params.prior_h == 'marginal':
199
+ self.register_buffer('prior_h', self.get_categorical_prop('atom')) # add to module
200
+ self.module_h = MarginalPriorMarkovBridge(self.atom_nf, self.prior_h, loss_type=loss_params.discrete_loss)
201
+
202
+ if simulation_params.prior_e == 'uniform':
203
+ self.module_e = UniformPriorMarkovBridge(self.bond_nf, loss_type=loss_params.discrete_loss)
204
+ elif simulation_params.prior_e == 'marginal':
205
+ self.register_buffer('prior_e', self.get_categorical_prop('bond')) # add to module
206
+ self.module_e = MarginalPriorMarkovBridge(self.bond_nf, self.prior_e, loss_type=loss_params.discrete_loss)
207
+
208
+
209
+ # Loss parameters
210
+ self.loss_reduce = loss_params.reduce
211
+ self.lambda_x = loss_params.lambda_x
212
+ self.lambda_h = loss_params.lambda_h
213
+ self.lambda_e = loss_params.lambda_e
214
+ self.lambda_chi = loss_params.lambda_chi if self.flexible else None
215
+ self.lambda_trans = loss_params.lambda_trans if self.flexible_bb else None
216
+ self.lambda_rot = loss_params.lambda_rot if self.flexible_bb else None
217
+ self.lambda_clash = loss_params.lambda_clash
218
+ self.regularize_uncertainty = loss_params.regularize_uncertainty
219
+
220
+ if loss_params.timestep_weights is not None:
221
+ weight_type = loss_params.timestep_weights.split('_')[0]
222
+ kwargs = loss_params.timestep_weights.split('_')[1:]
223
+ kwargs = {x.split('=')[0]: float(x.split('=')[1]) for x in kwargs}
224
+ self.timestep_weights = TimestepWeights(weight_type, **kwargs)
225
+ else:
226
+ self.timestep_weights = None
227
+
228
+
229
+ # Sampling
230
+ self.T_sampling = eval_params.n_sampling_steps
231
+ self.train_step_size = 1 / simulation_params.n_steps
232
+ self.size_distribution = None # initialized only if needed
233
+
234
+
235
+ # Metrics, initialized only if needed
236
+ self.train_smiles = None
237
+ self.ligand_metrics = None
238
+ self.molecule_properties = None
239
+ self.evaluator = None
240
+ self.ligand_atom_type_distribution = None
241
+ self.ligand_bond_type_distribution = None
242
+
243
+ # containers for metric aggregation
244
+ self.training_step_outputs = []
245
+ self.validation_step_outputs = []
246
+
247
+ def on_load_checkpoint(self, checkpoint):
248
+ """
249
+ This hook is only used for backward compatibility with checkpoints that
250
+ did not save prior_h and prior_e in state_dict in the past
251
+ """
252
+ if hasattr(self, "prior_h") and "prior_h" not in checkpoint["state_dict"]:
253
+ checkpoint["state_dict"]["prior_h"] = self.get_categorical_prop('atom')
254
+ if hasattr(self, "prior_e") and "prior_e" not in checkpoint["state_dict"]:
255
+ checkpoint["state_dict"]["prior_e"] = self.get_categorical_prop('bond')
256
+ if "prior_e" in checkpoint["state_dict"] and not hasattr(self, "prior_e"):
257
+ # NOTE: a very exotic case that happened to one model. Potentially can be removed in the future
258
+ self.register_buffer("prior_e", self.get_categorical_prop('bond'))
259
+
260
+ def init_model(self, predictor_params):
261
+
262
+ model_type = predictor_params.backbone
263
+
264
+ if 'heterogeneous_graph' in predictor_params and predictor_params.heterogeneous_graph:
265
+ return DynamicsHetero(
266
+ atom_nf=self.atom_nf,
267
+ residue_nf=self.residue_nf,
268
+ bond_dict=self.bond_encoder,
269
+ pocket_bond_dict=self.residue_bond_encoder,
270
+ model=model_type,
271
+ num_rbf_time=predictor_params.__dict__.get('num_rbf_time'),
272
+ model_params=getattr(predictor_params, model_type + '_params'),
273
+ edge_cutoff_ligand=predictor_params.edge_cutoff_ligand,
274
+ edge_cutoff_pocket=predictor_params.edge_cutoff_pocket,
275
+ edge_cutoff_interaction=predictor_params.edge_cutoff_interaction,
276
+ predict_angles=self.flexible,
277
+ predict_frames=self.flexible_bb,
278
+ add_cycle_counts=predictor_params.cycle_counts,
279
+ add_spectral_feat=predictor_params.spectral_feat,
280
+ add_nma_feat=predictor_params.normal_modes,
281
+ reflection_equiv=predictor_params.reflection_equivariant,
282
+ d_max=predictor_params.d_max,
283
+ num_rbf_dist=predictor_params.num_rbf,
284
+ self_conditioning=predictor_params.self_conditioning,
285
+ augment_residue_sc=self.augment_residue_sc,
286
+ augment_ligand_sc=self.augment_ligand_sc,
287
+ add_chi_as_feature=predictor_params.add_chi_as_feature,
288
+ angle_act_fn=predictor_params.angle_act_fn,
289
+ add_all_atom_diff=predictor_params.add_all_atom_diff,
290
+ predict_confidence=self.predict_confidence,
291
+ )
292
+
293
+ else:
294
+ if predictor_params.__dict__.get('num_rbf_time') is not None:
295
+ raise NotImplementedError("RBF time embedding not yet implemented")
296
+
297
+ return Dynamics(
298
+ atom_nf=self.atom_nf,
299
+ residue_nf=self.residue_nf,
300
+ joint_nf=predictor_params.joint_nf,
301
+ bond_dict=self.bond_encoder,
302
+ pocket_bond_dict=self.residue_bond_encoder,
303
+ edge_nf=predictor_params.edge_nf,
304
+ hidden_nf=predictor_params.hidden_nf,
305
+ model=model_type,
306
+ model_params=getattr(predictor_params, model_type + '_params'),
307
+ edge_cutoff_ligand=predictor_params.edge_cutoff_ligand,
308
+ edge_cutoff_pocket=predictor_params.edge_cutoff_pocket,
309
+ edge_cutoff_interaction=predictor_params.edge_cutoff_interaction,
310
+ predict_angles=self.flexible,
311
+ predict_frames=self.flexible_bb,
312
+ add_cycle_counts=predictor_params.cycle_counts,
313
+ add_spectral_feat=predictor_params.spectral_feat,
314
+ add_nma_feat=predictor_params.normal_modes,
315
+ self_conditioning=predictor_params.self_conditioning,
316
+ augment_residue_sc=self.augment_residue_sc,
317
+ augment_ligand_sc=self.augment_ligand_sc,
318
+ add_chi_as_feature=predictor_params.add_chi_as_feature,
319
+ angle_act_fn=predictor_params.angle_act_fn,
320
+ )
321
+
322
+ def _load_histogram(self, type):
323
+ """
324
+ Load empirical categorical distributions of atom or bond types from disk.
325
+ Returns None if the required file is not found.
326
+ """
327
+ assert type in {"atom", "bond"}
328
+ filename = 'ligand_type_histogram.npy' if type == 'atom' else 'ligand_bond_type_histogram.npy'
329
+ encoder = self.atom_encoder if type == 'atom' else self.bond_encoder
330
+ hist_file = Path(self.datadir, filename)
331
+ if not hist_file.exists():
332
+ return None
333
+ hist = np.load(hist_file, allow_pickle=True).item()
334
+ return CategoricalDistribution(hist, encoder)
335
+
336
+ def get_categorical_prop(self, type):
337
+ hist = self._load_histogram(type)
338
+ encoder = self.atom_encoder if type == 'atom' else self.bond_encoder
339
+ # Note: default value ensures that code will crash if prior is not
340
+ # read from disk or loaded from checkpoint later on
341
+ return torch.zeros(len(encoder)) * float("nan") if hist is None else torch.tensor(hist.p)
342
+
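The histograms consumed by `_load_histogram` are pickled dicts stored inside `.npy` files; a sketch of inspecting one outside the model (the filename follows the code above, and the key format is assumed to match the encoders, e.g. atom symbols mapped to counts):

```python
import numpy as np

hist = np.load('ligand_type_histogram.npy', allow_pickle=True).item()
# e.g. {'C': 120000, 'N': 31000, ...} (assumed layout: category -> count)
```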
343
+ def configure_optimizers(self):
344
+ optimizers = [
345
+ torch.optim.AdamW(self.parameters(), lr=self.lr, amsgrad=True, weight_decay=1e-12),
346
+ ]
347
+
348
+ if self.lr_step_size is None or self.lr_gamma is None:
349
+ lr_schedulers = []
350
+ else:
351
+ lr_schedulers = [
352
+ torch.optim.lr_scheduler.StepLR(optimizers[0], step_size=self.lr_step_size, gamma=self.lr_gamma),
353
+ ]
354
+ return optimizers, lr_schedulers
355
+
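With both `lr_step_size` and `lr_gamma` set, the learning rate follows a standard step decay, `lr_t = lr * lr_gamma ** (epoch // lr_step_size)`; leaving either unset disables the scheduler. A standalone sketch of the schedule:

```python
import torch

p = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.AdamW([p], lr=1e-3)
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=10, gamma=0.5)
for _ in range(10):
    opt.step()
    sched.step()
assert abs(opt.param_groups[0]['lr'] - 5e-4) < 1e-12  # halved after 10 epochs
```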
356
+ def setup(self, stage: Optional[str] = None):
357
+
358
+ self.setup_sampling()
359
+
360
+ if stage == 'fit':
361
+ self.train_dataset = self.get_dataset(stage='train')
362
+ self.val_dataset = self.get_dataset(stage='val')
363
+ self.setup_metrics()
364
+ elif stage == 'val':
365
+ self.val_dataset = self.get_dataset(stage='val')
366
+ self.setup_metrics()
367
+ elif stage == 'test':
368
+ self.test_dataset = self.get_dataset(stage='test')
369
+ self.setup_metrics()
370
+ elif stage == 'generation':
371
+ pass
372
+ else:
373
+ raise NotImplementedError
374
+
375
+ def get_dataset(self, stage, pocket_transform=None):
376
+
377
+ # when sampling we don't append virtual nodes as we might need access to the ground truth size
378
+ if self.virtual_nodes and stage == "train":
379
+ ligand_transform = AppendVirtualNodesInCoM(
380
+ atom_encoder, bond_encoder, add_min=self.add_virtual_min, add_max=self.add_virtual_max)
381
+ else:
382
+ ligand_transform = None
383
+
384
+ # we want to know if something goes wrong on the validation or test set
385
+ catch_errors = stage == "train"
386
+
387
+ if self.sharded_dataset:
388
+ return get_wds(
389
+ data_path=self.datadir,
390
+ stage='val' if self.debug else stage,
391
+ ligand_transform=ligand_transform,
392
+ pocket_transform=pocket_transform,
393
+ )
394
+
395
+ if self.sample_from_clusters and stage == "train": # val/test should be deterministic
396
+ return ClusteredDataset(
397
+ pt_path=Path(self.datadir, 'val.pt' if self.debug else f'{stage}.pt'),
398
+ ligand_transform=ligand_transform,
399
+ pocket_transform=pocket_transform,
400
+ catch_errors=catch_errors
401
+ )
402
+
403
+ return ProcessedLigandPocketDataset(
404
+ pt_path=Path(self.datadir, 'val.pt' if self.debug else f'{stage}.pt'),
405
+ ligand_transform=ligand_transform,
406
+ pocket_transform=pocket_transform,
407
+ catch_errors=catch_errors
408
+ )
409
+
410
+ def setup_sampling(self):
411
+ # distribution of nodes
412
+ histogram_file = Path(self.datadir, 'size_distribution.npy') # TODO: store this in model checkpoint so that we can sample without this file
413
+ size_histogram = np.load(histogram_file).tolist()
414
+ self.size_distribution = DistributionNodes(size_histogram)
415
+
416
+ def setup_metrics(self):
417
+ # For metrics
418
+ smiles_file = Path(self.datadir, 'train_smiles.npy')
419
+ self.train_smiles = None if not smiles_file.exists() else np.load(smiles_file)
420
+
421
+ self.ligand_metrics = MoleculeValidity()
422
+ self.molecule_properties = MolecularProperties()
423
+ self.evaluator = FullEvaluator(gnina=self.gnina, exclude_evaluators=['geometry', 'ring_count'])
424
+ self.ligand_atom_type_distribution = self._load_histogram('atom')
425
+ self.ligand_bond_type_distribution = self._load_histogram('bond')
426
+
427
+ def train_dataloader(self):
428
+ shuffle = None if self.overfit else False if self.sharded_dataset else True
429
+ return DataLoader(self.train_dataset, self.batch_size, shuffle=shuffle,
430
+ sampler=SubsetRandomSampler([0]) if self.overfit else None,
431
+ num_workers=self.num_workers,
432
+ collate_fn=self.train_dataset.collate_fn,
433
+ # collate_fn=partial(self.train_dataset.collate_fn, ligand_transform=batch_transform),
434
+ pin_memory=True)
435
+
436
+ def val_dataloader(self):
437
+ if self.overfit:
438
+ return self.train_dataloader()
439
+
440
+ return DataLoader(self.val_dataset, self.eval_batch_size,
441
+ shuffle=False, num_workers=self.num_workers,
442
+ collate_fn=self.val_dataset.collate_fn,
443
+ pin_memory=True)
444
+
445
+ def test_dataloader(self):
446
+ return DataLoader(self.test_dataset, self.eval_batch_size, shuffle=False,
447
+ num_workers=self.num_workers,
448
+ collate_fn=self.test_dataset.collate_fn,
449
+ pin_memory=True)
450
+
451
+ def log_metrics(self, metrics_dict, split, batch_size=None, **kwargs):
452
+ for m, value in metrics_dict.items():
453
+ self.log(f'{m}/{split}', value, batch_size=batch_size, **kwargs)
454
+
455
+ def aggregate_metrics(self, step_outputs, prefix):
456
+ if 'timestep' in step_outputs[0]:
457
+ timesteps = torch.cat([x['timestep'] for x in step_outputs]).squeeze()
458
+
459
+ if 'loss_per_sample' in step_outputs[0]:
460
+ losses = torch.cat([x['loss_per_sample'] for x in step_outputs])
461
+ pearson_corr = torch.corrcoef(torch.stack([timesteps, losses], dim=0))[0, 1]
462
+ self.log(f'corr_loss_timestep/{prefix}', pearson_corr, prog_bar=False)
463
+
464
+ if 'eps_hat_norm' in step_outputs[0]:
465
+ eps_norm = torch.cat([x['eps_hat_norm'] for x in step_outputs])
466
+ pearson_corr = torch.corrcoef(torch.stack([timesteps, eps_norm], dim=0))[0, 1]
467
+ self.log(f'corr_eps_timestep/{prefix}', pearson_corr, prog_bar=False)
468
+
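`torch.corrcoef` on a stacked `(2, N)` tensor returns the 2x2 correlation matrix, so the `[0, 1]` entry is exactly the Pearson correlation that gets logged. A minimal sketch:

```python
import torch

a = torch.randn(1000)
b = 0.8 * a + 0.2 * torch.randn(1000)
r = torch.corrcoef(torch.stack([a, b], dim=0))[0, 1]  # scalar in [-1, 1]
```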
469
+ def on_train_epoch_end(self):
470
+ self.aggregate_metrics(self.training_step_outputs, 'train')
471
+ self.training_step_outputs.clear()
472
+
473
+ # TODO: doesn't work in multi-GPU mode
474
+ # def on_before_batch_transfer(self, batch, dataloader_idx):
475
+ # """
476
+ # Performs operations on data before it is transferred to the GPU.
477
+ # Hence, supports multiple dataloaders for speedup.
478
+ # """
479
+ # batch['pocket'] = Residues(**batch['pocket'])
480
+ # return batch
481
+
482
+ # # TODO: try if this is compatible with DDP
483
+ # def on_after_batch_transfer(self, batch, dataloader_idx):
484
+ # """
485
+ # Performs operations on data after it is transferred to the GPU.
486
+ # """
487
+ # batch['pocket'] = Residues(**batch['pocket'])
488
+ # batch['ligand'] = TensorDict(**batch['ligand'])
489
+ # return batch
490
+
491
+ def get_sc_transform_fn(self, zt_chi, zt_x, t, z0_chi, ligand_mask, pocket):
492
+ sc_transform = {}
493
+
494
+ if self.augment_residue_sc:
495
+ def pred_all_atom(pred_chi, pred_trans=None, pred_rot=None):
496
+ temp_pocket = pocket.deepcopy()
497
+
498
+ if pred_trans is not None and pred_rot is not None:
499
+ zt_trans = pocket['x']
500
+ zt_rot = pocket['axis_angle']
501
+ z1_trans_pred = self.module_trans.get_z1_given_zt_and_pred(
502
+ zt_trans, pred_trans, None, t, pocket['mask'])
503
+ z1_rot_pred = self.module_rot.get_z1_given_zt_and_pred(
504
+ zt_rot, pred_rot, None, t, pocket['mask'])
505
+ temp_pocket.set_frame(z1_trans_pred, z1_rot_pred)
506
+
507
+ z1_chi_pred = self.module_chi.get_z1_given_zt_and_pred(
508
+ zt_chi[..., :5], pred_chi, z0_chi, t, pocket['mask'])
509
+ temp_pocket.set_chi(z1_chi_pred)
510
+
511
+ all_coord = temp_pocket['v'] + temp_pocket['x'].unsqueeze(1)
512
+ return all_coord - pocket['x'].unsqueeze(1)
513
+
514
+ sc_transform['residues'] = pred_all_atom
515
+
516
+ if self.augment_ligand_sc:
517
+ # sc_transform['atoms'] = partial(self.module_x.get_z1_given_zt_and_pred, zt=zs_x, z0=None, t=t, batch_mask=lig_mask)
518
+ sc_transform['atoms'] = lambda pred: (self.module_x.get_z1_given_zt_and_pred(
519
+ zt_x, pred.squeeze(1), None, t, ligand_mask) - zt_x).unsqueeze(1)
520
+
521
+ return sc_transform
522
+
523
+ def compute_loss(self, ligand, pocket, return_info=False):
524
+ """
525
+ Samples a timestep per example, builds the noisy intermediate state, runs the network, and computes the weighted training loss
526
+ """
527
+ # TODO: move somewhere else (like collate_fn)
528
+ pocket = Residues(**pocket)
529
+
530
+ # Center sample
531
+ ligand, pocket = center_data(ligand, pocket)
532
+ if pocket['x'].numel() > 0:
533
+ pocket_com = scatter_mean(pocket['x'], pocket['mask'], dim=0)
534
+ else:
535
+ pocket_com = scatter_mean(ligand['x'], ligand['mask'], dim=0)
536
+
537
+ # # Normalize pocket coordinates
538
+ # pocket['x'] = self.module_x.normalize(pocket['x'])
539
+
540
+ # Sample a timestep t for each example in batch
541
+ t = torch.rand(ligand['size'].size(0), device=ligand['x'].device).unsqueeze(-1)
542
+
543
+ # Noise
544
+ z0_x = self.module_x.sample_z0(pocket_com, ligand['mask'])
545
+ z0_h = self.module_h.sample_z0(ligand['mask'])
546
+ z0_e = self.module_e.sample_z0(ligand['bond_mask'])
547
+ zt_x = self.module_x.sample_zt(z0_x, ligand['x'], t, ligand['mask'])
548
+ zt_h = self.module_h.sample_zt(z0_h, ligand['one_hot'], t, ligand['mask'])
549
+ zt_e = self.module_e.sample_zt(z0_e, ligand['bond_one_hot'], t, ligand['bond_mask'])
550
+
551
+ if self.flexible_bb:
552
+ z0_trans = self.module_trans.sample_z0(pocket_com, pocket['mask'])
553
+ z1_trans = pocket['x'].detach().clone()
554
+ zt_trans = self.module_trans.sample_zt(z0_trans, z1_trans, t, pocket['mask'])
555
+
556
+ z0_rot = self.module_rot.sample_z0(pocket['mask'])
557
+ z1_rot = pocket['axis_angle'].detach().clone()
558
+ zt_rot = self.module_rot.sample_zt(z0_rot, z1_rot, t, pocket['mask'])
559
+
560
+ # update pocket
561
+ pocket.set_frame(zt_trans, zt_rot)
562
+
563
+ z0_chi, zt_chi = None, None
564
+ if self.flexible:
565
+ # residues = [data_utils.residue_from_internal_coord(ic) for ic in pocket['residues']]
566
+ # residues = pocket['residues']
567
+ # z1_chi = torch.stack([data_utils.get_torsion_angles(r, device=self.device) for r in residues], dim=0)
568
+ z1_chi = pocket['chi'][:, :5].detach().clone()
569
+
570
+ z0_chi = self.module_chi.sample_z0(pocket['mask'])
571
+ zt_chi = self.module_chi.sample_zt(z0_chi, z1_chi, t, pocket['mask'])
572
+
573
+ # internal to external coordinates
574
+ pocket.set_chi(zt_chi)
575
+ if pocket['x'].numel() == 0:
576
+ pocket.set_empty_v()
577
+
578
+ # Predict denoising
579
+ sc_transform = self.get_sc_transform_fn(zt_chi, zt_x, t, z0_chi, ligand['mask'], pocket)
580
+ # sc_transform = None
581
+ pred_ligand, pred_residues = self.dynamics(
582
+ zt_x, zt_h, ligand['mask'], pocket, t,
583
+ bonds_ligand=(ligand['bonds'], zt_e), sc_transform=sc_transform
584
+ )
585
+
586
+ # Compute L2 loss
587
+ if self.predict_confidence:
588
+ loss_x = self.module_x.compute_loss(pred_ligand['vel'], z0_x, ligand['x'], t, ligand['mask'], reduce='none')
589
+
590
+ # compute confidence regularization
591
+ k = self.module_x.dim # pred.size(-1)
592
+ sigma = pred_ligand['uncertainty_vel']
593
+ loss_x = loss_x / (2 * sigma ** 2) + k * torch.log(sigma)
594
+
595
+ if self.regularize_uncertainty is not None:
596
+ loss_x = loss_x + self.regularize_uncertainty * (pred_ligand['uncertainty_vel'] - 1) ** 2
597
+
598
+ loss_x = self.module_x.reduce_loss(loss_x, ligand['mask'], reduce=self.loss_reduce)
599
+ else:
600
+ loss_x = self.module_x.compute_loss(pred_ligand['vel'], z0_x, ligand['x'], t, ligand['mask'], reduce=self.loss_reduce)
601
+
602
+ # Loss for categorical variables
603
+ t_next = torch.clamp(t + self.train_step_size, max=1.0)
604
+ loss_h = self.module_h.compute_loss(pred_ligand['logits_h'], zt_h, ligand['one_hot'], ligand['mask'], t, t_next, reduce=self.loss_reduce)
605
+ loss_e = self.module_e.compute_loss(pred_ligand['logits_e'], zt_e, ligand['bond_one_hot'], ligand['bond_mask'], t, t_next, reduce=self.loss_reduce)
606
+
607
+ loss = self.lambda_x * loss_x + self.lambda_h * loss_h + self.lambda_e * loss_e
608
+ if self.flexible:
609
+ loss_chi = self.module_chi.compute_loss(pred_residues['chi'], z0_chi, z1_chi, zt_chi, t, pocket['mask'], reduce=self.loss_reduce)
610
+ loss = loss + self.lambda_chi * loss_chi
611
+
612
+ if self.flexible_bb:
613
+ loss_trans = self.module_trans.compute_loss(pred_residues['trans'], z0_trans, z1_trans, t, pocket['mask'], reduce=self.loss_reduce)
614
+ loss_rot = self.module_rot.compute_loss(pred_residues['rot'], z0_rot, z1_rot, zt_rot, t, pocket['mask'], reduce=self.loss_reduce)
615
+ loss = loss + self.lambda_trans * loss_trans + self.lambda_rot * loss_rot
616
+
617
+ if self.lambda_clash is not None and self.lambda_clash > 0:
618
+
619
+ if self.flexible_bb:
620
+ pred_z1_trans = self.module_trans.get_z1_given_zt_and_pred(zt_trans, pred_residues['trans'], z0_trans, t, pocket['mask'])
621
+ pred_z1_rot = self.module_rot.get_z1_given_zt_and_pred(zt_rot, pred_residues['rot'], z0_rot, t, pocket['mask'])
622
+ pocket.set_frame(pred_z1_trans, pred_z1_rot)
623
+
624
+ if self.flexible:
625
+ # internal to external coordinates
626
+ pred_z1_chi = self.module_chi.get_z1_given_zt_and_pred(zt_chi, pred_residues['chi'], z0_chi, t, pocket['mask'])
627
+ pocket.set_chi(pred_z1_chi)
628
+
629
+ pocket_coord = pocket['x'].unsqueeze(1) + pocket['v']
630
+ pocket_types = aa_atom_type_tensor[pocket['one_hot'].argmax(dim=-1)]
631
+ pocket_mask = pocket['mask'].unsqueeze(-1).repeat((1, pocket['v'].size(1)))
632
+
633
+ # Extract only existing atoms
634
+ atom_mask = aa_atom_mask_tensor[pocket['one_hot'].argmax(dim=-1)]
635
+ pocket_coord = pocket_coord[atom_mask]
636
+ pocket_types = pocket_types[atom_mask]
637
+ pocket_mask = pocket_mask[atom_mask]
638
+
639
+ # pred_z1_x = pred_x + z0_x
640
+ pred_z1_x = self.module_x.get_z1_given_zt_and_pred(zt_x, pred_ligand['vel'], z0_x, t, ligand['mask'])
641
+ pred_z1_h = pred_ligand['logits_h'].argmax(dim=-1)
642
+ loss_clash = clash_loss(pred_z1_x, pred_z1_h, ligand['mask'],
643
+ pocket_coord, pocket_types, pocket_mask)
644
+ loss = loss + self.lambda_clash * loss_clash
645
+
646
+ if self.timestep_weights is not None:
647
+ w_t = self.timestep_weights(t).squeeze()
648
+ loss = w_t * loss
649
+
650
+ loss = loss.mean(0)
651
+
652
+ info = {
653
+ 'loss_x': loss_x.mean().item(),
654
+ 'loss_h': loss_h.mean().item(),
655
+ 'loss_e': loss_e.mean().item(),
656
+ }
657
+ if self.flexible:
658
+ info['loss_chi'] = loss_chi.mean().item()
659
+ if self.flexible_bb:
660
+ info['loss_trans'] = loss_trans.mean().item()
661
+ info['loss_rot'] = loss_rot.mean().item()
662
+ if self.lambda_clash is not None:
663
+ info['loss_clash'] = loss_clash.mean().item()
664
+ if self.predict_confidence:
665
+ sigma_x_mol = scatter_mean(pred_ligand['uncertainty_vel'], ligand['mask'], dim=0)
666
+ info['pearson_sigma_x'] = torch.corrcoef(torch.stack([sigma_x_mol.detach(), t.squeeze()]))[0, 1].item()
667
+ info['mean_sigma_x'] = sigma_x_mol.mean().item()
668
+ entropy_h = Categorical(logits=pred_ligand['logits_h']).entropy()
669
+ entropy_h_mol = scatter_mean(entropy_h, ligand['mask'], dim=0)
670
+ info['pearson_entropy_h'] = torch.corrcoef(torch.stack([entropy_h_mol.detach(), t.squeeze()]))[0, 1].item()
671
+ info['mean_entropy_h'] = entropy_h_mol.mean().item()
672
+ entropy_e = Categorical(logits=pred_ligand['logits_e']).entropy()
673
+ entropy_e_mol = scatter_mean(entropy_e, ligand['bond_mask'], dim=0)
674
+ info['pearson_entropy_e'] = torch.corrcoef(torch.stack([entropy_e_mol.detach(), t.squeeze()]))[0, 1].item()
675
+ info['mean_entropy_e'] = entropy_e_mol.mean().item()
676
+
677
+ return (loss, info) if return_info else loss
678
+
679
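A minimal, self-contained sketch of the confidence-weighted coordinate loss used in `compute_loss` above: the per-atom L2 error is turned into a Gaussian negative log-likelihood with predicted scale `sigma`, plus the optional pull of `sigma` towards 1. All tensors below are dummy data, not the model's real inputs.

import torch

n_atoms, k = 6, 3                        # k: coordinate dimensionality (module_x.dim)
loss_x = torch.rand(n_atoms)             # per-atom squared error (reduce='none')
sigma = torch.rand(n_atoms) + 0.5        # predicted per-atom uncertainty
nll = loss_x / (2 * sigma ** 2) + k * torch.log(sigma)

reg_weight = 0.1                         # plays the role of `regularize_uncertainty`
nll = nll + reg_weight * (sigma - 1) ** 2
print(nll.shape)                         # torch.Size([6]); reduced per molecule afterwards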
+     def training_step(self, data, *args):
+         ligand, pocket = data['ligand'], data['pocket']
+         try:
+             loss, info = self.compute_loss(ligand, pocket, return_info=True)
+         except RuntimeError as e:
+             # skipping the batch on OOM is not supported for multi-GPU training
+             if self.trainer.num_devices < 2 and 'out of memory' in str(e):
+                 print('WARNING: ran out of memory, skipping to the next batch')
+                 return None
+             else:
+                 raise e
+
+         log_dict = {k: v for k, v in info.items() if isinstance(v, float)
+                     or torch.numel(v) <= 1}
+         # if self.learn_nu:
+         #     log_dict['nu_x'] = self.noise_schedules['x'].nu.item()
+         #     log_dict['nu_h'] = self.noise_schedules['h'].nu.item()
+         #     log_dict['nu_e'] = self.noise_schedules['e'].nu.item()
+
+         self.log_metrics({'loss': loss, **log_dict}, 'train',
+                          batch_size=len(ligand['size']))
+
+         out = {'loss': loss, **info}
+         self.training_step_outputs.append(out)
+         return out
+
+     def validation_step(self, data, *args):
+
+         # Compute the loss N times and average to get a better estimate
+         loss_list, info_list = [], []
+         self.dynamics.train()  # TODO: this is currently necessary to make self-conditioning work
+         for _ in range(self.n_loss_per_sample):
+             loss, info = self.compute_loss(data['ligand'].copy(),
+                                            data['pocket'].copy(),
+                                            return_info=True)
+             loss_list.append(loss.item())
+             info_list.append(info)
+         self.dynamics.eval()
+         if len(loss_list) >= 1:
+             loss = np.mean(loss_list)
+             info = {k: np.mean([x[k] for x in info_list]) for k in info_list[0]}
+             self.log_metrics({'loss': loss, **info}, 'val', batch_size=len(data['ligand']['size']))
+
+         # Sample
+         rdmols, rdpockets, _ = self.sample(
+             data=data,
+             n_samples=self.n_eval_samples,
+             num_nodes="ground_truth" if self.sample_with_ground_truth_size else None,
+         )
+
+         out = {
+             'ligands': rdmols,
+             'pockets': rdpockets,
+             'receptor_files': [Path(self.receptor_dir, 'val', x) for x in data['pocket']['name']]
+         }
+         self.validation_step_outputs.append(out)
+         return out
+
+     # def test_step(self, data, *args):
+     #     self._shared_eval(data, 'test', *args)
+
+     def on_validation_epoch_end(self):
+
+         outdir = Path(self.outdir, f'epoch_{self.current_epoch}')
+
+         rdmols = [m for x in self.validation_step_outputs for m in x['ligands']]
+         rdpockets = [p for x in self.validation_step_outputs for p in x['pockets']]
+         receptors = [r for x in self.validation_step_outputs for r in x['receptor_files']]
+         self.validation_step_outputs.clear()
+
+         ligand_atom_types = [atom_encoder[a.GetSymbol()] for m in rdmols for a in m.GetAtoms()]
+         ligand_bond_types = []
+         for m in rdmols:
+             bonds = m.GetBonds()
+             no_bonds = m.GetNumAtoms() * (m.GetNumAtoms() - 1) // 2 - m.GetNumBonds()
+             ligand_bond_types += [bond_encoder['NOBOND']] * no_bonds
+             for b in bonds:
+                 ligand_bond_types.append(bond_encoder[b.GetBondType().name])
+
+         tic = time()
+         results = self.analyze_sample(
+             rdmols, ligand_atom_types, ligand_bond_types, receptors=(rdpockets if len(rdpockets) != 0 else None)
+         )
+         self.log_metrics(results, 'val')
+         print(f'Evaluation took {time() - tic:.2f} seconds')
+
+         if (self.current_epoch + 1) % self.visualize_sample_epoch == 0:
+             tic = time()
+
+             outdir.mkdir(exist_ok=True, parents=True)
+
+             # center for better visualization
+             rdmols = rdmols[:self.n_visualize_samples]
+             rdpockets = rdpockets[:self.n_visualize_samples]
+             for m, p in zip(rdmols, rdpockets):
+                 center = m.GetConformer().GetPositions().mean(axis=0)
+                 for i in range(m.GetNumAtoms()):
+                     x, y, z = m.GetConformer().GetPositions()[i] - center
+                     m.GetConformer().SetAtomPosition(i, (x, y, z))
+                 for i in range(p.GetNumAtoms()):
+                     x, y, z = p.GetConformer().GetPositions()[i] - center
+                     p.GetConformer().SetAtomPosition(i, (x, y, z))
+
+             # save molecules
+             utils.write_sdf_file(Path(outdir, 'molecules.sdf'), rdmols)
+
+             # save pockets
+             utils.write_sdf_file(Path(outdir, 'pockets.sdf'), rdpockets)
+
+             print(f'Sample visualization took {time() - tic:.2f} seconds')
+
+         if (self.current_epoch + 1) % self.visualize_chain_epoch == 0:
+             tic = time()
+             outdir.mkdir(exist_ok=True, parents=True)
+
+             if self.sharded_dataset:
+                 index = torch.randint(len(self.val_dataset), size=(1,)).item()
+                 for i, x in enumerate(self.val_dataset):
+                     if i == index:
+                         break
+                 batch = self.val_dataset.collate_fn([x])
+             else:
+                 batch = self.val_dataset.collate_fn([self.val_dataset[torch.randint(len(self.val_dataset), size=(1,))]])
+             batch['pocket'] = Residues(**batch['pocket']).to(self.device)
+             pocket_copy = batch['pocket'].copy()
+
+             if len(batch['pocket']['x']) > 0:
+                 ligand_chain, pocket_chain, info = self.sample_chain(batch['pocket'], self.keep_frames)
+             else:
+                 num_nodes, _ = self.size_distribution.sample()
+                 ligand_chain, pocket_chain, info = self.sample_chain(batch['pocket'], self.keep_frames, num_nodes=num_nodes)
+
+             # utils.write_sdf_file(Path(outdir, 'chain_pocket.sdf'), pocket_chain)
+             # utils.write_chain(Path(outdir, 'chain_pocket.xyz'), pocket_chain)
+             if self.flexible or self.flexible_bb:
+                 # insert ground truth at the beginning so that it's used by PyMOL to determine the connectivity
+                 ground_truth_pocket = pocket_to_rdkit(
+                     pocket_copy, self.pocket_representation,
+                     self.atom_encoder, self.atom_decoder,
+                     self.aa_decoder, self.residue_decoder,
+                     self.aa_atom_index
+                 )[0]
+                 ground_truth_ligand = build_molecule(
+                     batch['ligand']['x'], batch['ligand']['one_hot'].argmax(1),
+                     bonds=batch['ligand']['bonds'],
+                     bond_types=batch['ligand']['bond_one_hot'].argmax(1),
+                     atom_decoder=self.atom_decoder,
+                     bond_decoder=self.bond_decoder
+                 )
+                 pocket_chain.insert(0, ground_truth_pocket)
+                 ligand_chain.insert(0, ground_truth_ligand)
+                 # pocket_chain.insert(0, pocket_chain[-1])
+                 # ligand_chain.insert(0, ligand_chain[-1])
+
+             # save molecules
+             utils.write_sdf_file(Path(outdir, 'chain_ligand.sdf'), ligand_chain)
+
+             # save pocket
+             mols_to_pdbfile(pocket_chain, Path(outdir, 'chain_pocket.pdb'))
+
+             self.log_metrics(info, 'val')
+             print(f'Chain visualization took {time() - tic:.2f} seconds')
+
+
+     # NOTE: temporary fix of this Lightning bug:
+     # https://github.com/Lightning-AI/pytorch-lightning/discussions/18110
+     # Without it, resuming training behaves strangely and fails
+     @property
+     def total_batch_idx(self) -> int:
+         """Returns the current batch index (across epochs)"""
+         # use `ready` instead of `completed` in case this is accessed after `completed` has been increased
+         # but before the next `ready` increase
+         return max(0, self.batch_progress.total.ready - 1)
+
+     @property
+     def batch_idx(self) -> int:
+         """Returns the current batch index (within this epoch)"""
+         # use `ready` instead of `completed` in case this is accessed after `completed` has been increased
+         # but before the next `ready` increase
+         return max(0, self.batch_progress.current.ready - 1)
+
+     # def analyze_sample(self, rdmols, atom_types, bond_types, aa_types=None, receptors=None):
+     #     out = {}
+
+     #     # Distribution of node types
+     #     kl_div_atom = self.ligand_atom_type_distribution.kl_divergence(atom_types) \
+     #         if self.ligand_atom_type_distribution is not None else -1
+     #     out['kl_div_atom_types'] = kl_div_atom
+
+     #     # Distribution of edge types
+     #     kl_div_bond = self.ligand_bond_type_distribution.kl_divergence(bond_types) \
+     #         if self.ligand_bond_type_distribution is not None else -1
+     #     out['kl_div_bond_types'] = kl_div_bond
+
+     #     if aa_types is not None:
+     #         kl_div_aa = self.pocket_type_distribution.kl_divergence(aa_types) \
+     #             if self.pocket_type_distribution is not None else -1
+     #         out['kl_div_residue_types'] = kl_div_aa
+
+     #     # Post-process sample
+     #     processed_mols = [process_all(m) for m in rdmols]
+
+     #     # Other basic metrics
+     #     results = self.ligand_metrics(rdmols)
+     #     out['n_samples'] = results['n_total']
+     #     out['Validity'] = results['validity']
+     #     out['Connectivity'] = results['connectivity']
+     #     out['valid_and_connected'] = results['valid_and_connected']
+
+     #     # connected_mols = [get_largest_fragment(m) for m in rdmols]
+     #     connected_mols = [process_all(m, largest_frag=True, adjust_aromatic_Ns=False, relax_iter=0) for m in rdmols]
+     #     connected_mols = [m for m in connected_mols if m is not None]
+     #     out.update(self.molecule_properties(connected_mols))
+
+     #     # Repeat after post-processing
+     #     results = self.ligand_metrics(processed_mols)
+     #     out['validity_processed'] = results['validity']
+     #     out['connectivity_processed'] = results['connectivity']
+     #     out['valid_and_connected_processed'] = results['valid_and_connected']
+
+     #     processed_mols = [m for m in processed_mols if m is not None]
+     #     for k, v in self.molecule_properties(processed_mols).items():
+     #         out[f"{k}_processed"] = v
+
+     #     # Simple docking score
+     #     if receptors is not None and self.gnina is not None:
+     #         assert len(receptors) == len(rdmols)
+     #         docking_results = compute_gnina_scores(rdmols, receptors, gnina=self.gnina)
+     #         out.update(docking_results)
+
+     #     # Clash score
+     #     if receptors is not None:
+     #         assert len(receptors) == len(rdmols)
+     #         clashes = {
+     #             'ligands': [legacy_clash_score(m) for m in rdmols],
+     #             'pockets': [legacy_clash_score(p) for p in receptors],
+     #             'between': [legacy_clash_score(m, p) for m, p in zip(rdmols, receptors)],
+     #             'v2_ligands': [clash_score(m) for m in rdmols],
+     #             'v2_pockets': [clash_score(p) for p in receptors],
+     #             'v2_between': [clash_score(m, p) for m, p in zip(rdmols, receptors)]
+     #         }
+     #         for k, v in clashes.items():
+     #             out[f'mean_clash_score_{k}'] = np.mean(v)
+     #             out[f'frac_no_clashes_{k}'] = np.mean(np.array(v) <= 0.0)
+
+     #     return out
+
+     def analyze_sample(self, rdmols, atom_types, bond_types, aa_types=None, receptors=None):
+         out = {}
+
+         # Distribution of node types
+         kl_div_atom = self.ligand_atom_type_distribution.kl_divergence(atom_types) \
+             if self.ligand_atom_type_distribution is not None else -1
+         out['kl_div_atom_types'] = kl_div_atom
+
+         # Distribution of edge types
+         kl_div_bond = self.ligand_bond_type_distribution.kl_divergence(bond_types) \
+             if self.ligand_bond_type_distribution is not None else -1
+         out['kl_div_bond_types'] = kl_div_bond
+
+         if aa_types is not None:
+             kl_div_aa = self.pocket_type_distribution.kl_divergence(aa_types) \
+                 if self.pocket_type_distribution is not None else -1
+             out['kl_div_residue_types'] = kl_div_aa
+
+         # Evaluation
+         results = []
+         if receptors is not None:
+             with tempfile.TemporaryDirectory() as tmpdir:
+                 for mol, receptor in zip(tqdm(rdmols, desc='FullEvaluator'), receptors):
+                     receptor_path = Path(tmpdir, 'receptor.pdb')
+                     Chem.MolToPDBFile(receptor, str(receptor_path))
+                     results.append(self.evaluator(mol, receptor_path))
+         else:
+             # instantiate the ligand-only evaluator once, not per molecule
+             self.evaluator = FullEvaluator(pb_conf='mol')
+             for mol in tqdm(rdmols, desc='FullEvaluator'):
+                 results.append(self.evaluator(mol))
+
+         results = pd.DataFrame(results)
+         agg_results = aggregated_metrics(results, self.evaluator.dtypes, VALIDITY_METRIC_NAME).fillna(0)
+         # replace literal dots (regex=False avoids '.' matching every character)
+         agg_results['metric'] = agg_results['metric'].str.replace('.', '/', regex=False)
+
+         col_results = collection_metrics(results, self.train_smiles, VALIDITY_METRIC_NAME, exclude_evaluators='fcd')
+         col_results['metric'] = 'collection/' + col_results['metric']
+
+         all_results = pd.concat([agg_results, col_results])
+         out.update(**dict(all_results[['metric', 'value']].values))
+
+         return out
+
+     def sample_zt_given_zs(self, zs_ligand, zs_pocket, s, t, delta_eps_x=None, uncertainty=None):
+
+         sc_transform = self.get_sc_transform_fn(zs_pocket.get('chi'), zs_ligand['x'], s, None, zs_ligand['mask'], zs_pocket)
+         pred_ligand, pred_residues = self.dynamics(
+             zs_ligand['x'], zs_ligand['h'], zs_ligand['mask'], zs_pocket, s, bonds_ligand=(zs_ligand['bonds'], zs_ligand['e']),
+             sc_transform=sc_transform
+         )
+
+         if delta_eps_x is not None:
+             pred_ligand['vel'] = pred_ligand['vel'] + delta_eps_x
+
+         zt_ligand = zs_ligand.copy()
+         zt_ligand['x'] = self.module_x.sample_zt_given_zs(zs_ligand['x'], pred_ligand['vel'], s, t, zs_ligand['mask'])
+         zt_ligand['h'] = self.module_h.sample_zt_given_zs(zs_ligand['h'], pred_ligand['logits_h'], s, t, zs_ligand['mask'])
+         zt_ligand['e'] = self.module_e.sample_zt_given_zs(zs_ligand['e'], pred_ligand['logits_e'], s, t, zs_ligand['edge_mask'])
+
+         zt_pocket = zs_pocket.copy()
+         if self.flexible_bb:
+             zt_trans_pocket = self.module_trans.sample_zt_given_zs(zs_pocket['x'], pred_residues['trans'], s, t, zs_pocket['mask'])
+             zt_rot_pocket = self.module_rot.sample_zt_given_zs(zs_pocket['axis_angle'], pred_residues['rot'], s, t, zs_pocket['mask'])
+
+             # update pocket in-place
+             zt_pocket.set_frame(zt_trans_pocket, zt_rot_pocket)
+
+         if self.flexible:
+             zt_chi_pocket = self.module_chi.sample_zt_given_zs(zs_pocket['chi'][..., :5], pred_residues['chi'], s, t, zs_pocket['mask'])
+
+             # update pocket in-place
+             zt_pocket.set_chi(zt_chi_pocket)
+
+         if self.predict_confidence:
+             assert uncertainty is not None
+             dt = (t - s).view(-1)[zt_ligand['mask']]
+             uncertainty['sigma_x_squared'] += (dt * pred_ligand['uncertainty_vel'] ** 2)
+             uncertainty['entropy_h'] += (dt * Categorical(logits=pred_ligand['logits_h']).entropy())
+
+         return zt_ligand, zt_pocket
+
+     def simulate(self, ligand, pocket, timesteps, t_start, t_end=1.0,
+                  return_frames=1, guide_log_prob=None):
+         """
+         Take a version of the ligand and pocket (at any time step t_start) and
+         simulate the generative process from t_start to t_end.
+         """
+
+         assert 0 < return_frames <= timesteps
+         assert timesteps % return_frames == 0
+         assert 0.0 <= t_start < 1.0
+         assert 0 < t_end <= 1.0
+         assert t_start < t_end
+
+         device = ligand['x'].device
+         n_samples = len(pocket['size'])
+         delta_t = (t_end - t_start) / timesteps
+
+         # Initialize output tensors
+         out_ligand = {
+             'x': torch.zeros((return_frames, len(ligand['mask']), self.x_dim), device=device),
+             'h': torch.zeros((return_frames, len(ligand['mask']), self.atom_nf), device=device),
+             'e': torch.zeros((return_frames, len(ligand['edge_mask']), self.bond_nf), device=device)
+         }
+         if self.predict_confidence:
+             out_ligand['sigma_x'] = torch.zeros((return_frames, len(ligand['mask'])), device=device)
+             out_ligand['entropy_h'] = torch.zeros((return_frames, len(ligand['mask'])), device=device)
+         out_pocket = {
+             'x': torch.zeros((return_frames, len(pocket['mask']), 3), device=device),  # CA coordinates
+             'v': torch.zeros((return_frames, len(pocket['mask']), self.n_atom_aa, 3), device=device)  # difference vectors to all other atoms
+         }
+
+         cumulative_uncertainty = {
+             'sigma_x_squared': torch.zeros(len(ligand['mask']), device=device),
+             'entropy_h': torch.zeros(len(ligand['mask']), device=device)
+         } if self.predict_confidence else None
+
+         for i, t in enumerate(torch.linspace(t_start, t_end - delta_t, timesteps)):
+             t_array = torch.full((n_samples, 1), fill_value=t, device=device)
+
+             if guide_log_prob is not None:
+                 raise NotImplementedError('Not yet implemented for flow matching model')
+                 # NOTE: everything below is unreachable until guidance is implemented
+                 alpha_t = self.diffusion_x.schedule.alpha(self.gamma_x(t_array))
+
+                 with torch.enable_grad():
+                     zt_x_ligand.requires_grad = True
+                     g = guide_log_prob(t_array, x=ligand['x'], h=ligand['h'], batch_mask=ligand['mask'],
+                                        bonds=ligand['bonds'], bond_types=ligand['e'])
+
+                     # Compute gradient w.r.t. coordinates
+                     grad_x_lig = torch.autograd.grad(g.sum(), inputs=ligand['x'])[0]
+
+                     # clip gradients
+                     g_max = 1.0
+                     clip_mask = (grad_x_lig.norm(dim=-1) > g_max)
+                     grad_x_lig[clip_mask] = \
+                         grad_x_lig[clip_mask] / grad_x_lig[clip_mask].norm(
+                             dim=-1, keepdim=True) * g_max
+
+                 delta_eps_lig = -1 * (1 - alpha_t[lig_mask]).sqrt() * grad_x_lig
+             else:
+                 delta_eps_lig = None
+
+             ligand, pocket = self.sample_zt_given_zs(
+                 ligand, pocket, t_array, t_array + delta_t, delta_eps_lig, cumulative_uncertainty)
+
+             # save frame
+             if (i + 1) % (timesteps // return_frames) == 0:
+                 idx = (i + 1) // (timesteps // return_frames)
+                 idx = idx - 1
+
+                 out_ligand['x'][idx] = ligand['x'].detach()
+                 out_ligand['h'][idx] = ligand['h'].detach()
+                 out_ligand['e'][idx] = ligand['e'].detach()
+                 if pocket['x'].numel() > 0:
+                     out_pocket['x'][idx] = pocket['x'].detach()
+                     out_pocket['v'][idx] = pocket['v'][:, :self.n_atom_aa, :].detach()
+                 if self.predict_confidence:
+                     out_ligand['sigma_x'][idx] = cumulative_uncertainty['sigma_x_squared'].sqrt().detach()
+                     out_ligand['entropy_h'][idx] = cumulative_uncertainty['entropy_h'].detach()
+
+         # remove frame dimension if only the final molecule is returned
+         out_ligand = {k: v.squeeze(0) for k, v in out_ligand.items()}
+         out_pocket = {k: v.squeeze(0) for k, v in out_pocket.items()}
+
+         return out_ligand, out_pocket
+
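A quick standalone check of the frame bookkeeping in `simulate`: with `timesteps` integration steps and `return_frames` stored frames, a frame is written every `timesteps // return_frames` steps, filling slots 0 to return_frames - 1. Dummy numbers only.

timesteps, return_frames = 100, 4
saved = [(i + 1) // (timesteps // return_frames) - 1
         for i in range(timesteps)
         if (i + 1) % (timesteps // return_frames) == 0]
print(saved)  # [0, 1, 2, 3] -- evenly spaced frame slots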
+     def init_ligand(self, num_nodes_lig, pocket):
+         device = pocket['x'].device
+
+         n_samples = len(pocket['size'])
+         lig_mask = utils.num_nodes_to_batch_mask(n_samples, num_nodes_lig, device)
+
+         # only consider the upper triangular matrix for symmetry
+         lig_bonds = torch.stack(torch.where(torch.triu(
+             lig_mask[:, None] == lig_mask[None, :], diagonal=1)), dim=0)
+         lig_edge_mask = lig_mask[lig_bonds[0]]
+
+         # Sample from a Normal distribution centered at the pocket
+         pocket_com = scatter_mean(pocket['x'], pocket['mask'], dim=0)
+         z0_x = self.module_x.sample_z0(pocket_com, lig_mask)
+         z0_h = self.module_h.sample_z0(lig_mask)
+         z0_e = self.module_e.sample_z0(lig_edge_mask)
+
+         return TensorDict(**{
+             'x': z0_x, 'h': z0_h, 'e': z0_e, 'mask': lig_mask,
+             'bonds': lig_bonds, 'edge_mask': lig_edge_mask
+         })
+
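Standalone sketch of the upper-triangular bond enumeration in `init_ligand`: for every pair of atoms in the same graph, exactly one pair (i, j) with i < j is kept. The toy batch mask below describes two graphs with 2 and 3 atoms.

import torch

lig_mask = torch.tensor([0, 0, 1, 1, 1])
bonds = torch.stack(torch.where(torch.triu(
    lig_mask[:, None] == lig_mask[None, :], diagonal=1)), dim=0)
print(bonds)
# tensor([[0, 2, 2, 3],
#         [1, 3, 4, 4]])  -> 1 pair in graph 0, 3 pairs in graph 1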
+     def init_pocket(self, pocket):
+
+         if self.flexible_bb:
+             pocket_com = scatter_mean(pocket['x'], pocket['mask'], dim=0)
+             z0_trans = self.module_trans.sample_z0(pocket_com, pocket['mask'])
+             z0_rot = self.module_rot.sample_z0(pocket['mask'])
+
+             # update pocket in-place
+             pocket.set_frame(z0_trans, z0_rot)
+
+         if self.flexible:
+             z0_chi = self.module_chi.sample_z0(pocket['mask'])
+
+             # # DEBUG ##
+             # z0_chi = torch.stack([data_utils.get_torsion_angles(r, device=self.device) for r in pocket['residues']], dim=0)
+             # ####
+
+             # internal to external coordinates
+             pocket.set_chi(z0_chi)
+
+         if pocket['x'].numel() == 0:
+             pocket.set_empty_v()
+
+         return pocket
+
+     def parse_num_nodes_spec(self, batch, spec=None, size_model=None):
+
+         if spec == "2d_histogram" or spec is None:  # default option
+             assert "pocket" in batch
+             num_nodes = self.size_distribution.sample_conditional(
+                 n1=None, n2=batch['pocket']['size'])
+
+             # make sure there is at least one potential bond
+             num_nodes[num_nodes < 2] = 2
+
+         elif isinstance(spec, (int, torch.Tensor)):
+             num_nodes = spec
+
+         elif spec == "ground_truth":
+             assert "ligand" in batch
+             num_nodes = batch['ligand']['size']
+
+         elif spec == "nn_prediction":
+             assert size_model is not None
+             assert "pocket" in batch
+             predictions = size_model.forward(batch['pocket'])
+             predictions = torch.softmax(predictions, dim=-1)
+             predictions[:, :5] = 0.0
+             probabilities = predictions / predictions.sum(dim=1, keepdims=True)
+             num_nodes = torch.distributions.Categorical(probabilities).sample()
+
+         elif isinstance(spec, str) and spec.startswith("uniform"):
+             # expected format: uniform_low_high
+             assert "pocket" in batch
+             left, right = map(int, spec.split("_")[1:])
+             shape = batch['pocket']['size'].shape
+             num_nodes = torch.randint(left, right + 1, shape, dtype=torch.long)
+
+         else:
+             raise NotImplementedError(f"Invalid size specification {spec}")
+
+         if self.virtual_nodes:
+             num_nodes += self.add_virtual_max
+
+         return num_nodes
+
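A hedged sketch of the only `spec` string that `parse_num_nodes_spec` actually parses, "uniform_low_high"; the other options dispatch to the size histogram, the ground-truth sizes, or a size model. The pocket batch below is dummy data.

import torch

batch = {'pocket': {'size': torch.tensor([12, 30])}}
spec = "uniform_10_30"
left, right = map(int, spec.split("_")[1:])       # -> 10, 30
num_nodes = torch.randint(left, right + 1, batch['pocket']['size'].shape, dtype=torch.long)
print(num_nodes.shape)                            # torch.Size([2]); one ligand size per pocket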
+     @torch.no_grad()
+     def sample(self, data, n_samples, num_nodes=None, timesteps=None,
+                guide_log_prob=None, size_model=None, **kwargs):
+
+         # TODO: move somewhere else (like collate_fn)
+         data['pocket'] = Residues(**data['pocket'])
+
+         timesteps = self.T_sampling if timesteps is None else timesteps
+
+         if len(data['pocket']['x']) > 0:
+             pocket = data_utils.repeat_items(data['pocket'], n_samples)
+         else:
+             pocket = Residues(**{key: value for key, value in data['pocket'].items()})
+             pocket['name'] = pocket['name'] * n_samples
+             pocket['size'] = pocket['size'].repeat(n_samples)
+             pocket['n_bonds'] = pocket['n_bonds'].repeat(n_samples)
+
+         _ligand = data_utils.repeat_items(data['ligand'], n_samples)
+         # _ligand = randomize_tensors(_ligand, exclude_keys=['size', 'name'])  # avoid data leakage
+
+         batch = {"ligand": _ligand, "pocket": pocket}
+         num_nodes = self.parse_num_nodes_spec(batch, spec=num_nodes, size_model=size_model)
+
+         # Sample from prior
+         if pocket['x'].numel() > 0:
+             ligand = self.init_ligand(num_nodes, pocket)
+         else:
+             ligand = self.init_ligand(num_nodes, _ligand)
+         pocket = self.init_pocket(pocket)
+
+         # return prior samples
+         if timesteps == 0:
+             # Convert into rdmols
+             rdmols = [build_molecule(coords=m['x'],
+                                      atom_types=m['h'].argmax(1),
+                                      bonds=m['bonds'],
+                                      bond_types=m['e'].argmax(1),
+                                      atom_decoder=self.atom_decoder, bond_decoder=self.bond_decoder)
+                       for m in data_utils.split_entity(ligand.detach().cpu(), edge_types={"e", "edge_mask"}, edge_mask=ligand["edge_mask"])]
+
+             rdpockets = pocket_to_rdkit(pocket, self.pocket_representation,
+                                         self.atom_encoder, self.atom_decoder,
+                                         self.aa_decoder, self.residue_decoder,
+                                         self.aa_atom_index)
+
+             return rdmols, rdpockets, _ligand['name']
+
+         out_tensors_ligand, out_tensors_pocket = self.simulate(
+             ligand, pocket, timesteps, 0.0, 1.0,
+             guide_log_prob=guide_log_prob
+         )
+
+         # Build mol objects
+         x = out_tensors_ligand['x'].detach().cpu()
+         ligand_type = out_tensors_ligand['h'].argmax(1).detach().cpu()
+         edge_type = out_tensors_ligand['e'].argmax(1).detach().cpu()
+         lig_mask = ligand['mask'].detach().cpu()
+         lig_bonds = ligand['bonds'].detach().cpu()
+         lig_edge_mask = ligand['edge_mask'].detach().cpu()
+         sizes = torch.unique(ligand['mask'], return_counts=True)[1].tolist()
+         offsets = list(accumulate(sizes[:-1], initial=0))
+         mol_kwargs = {
+             'coords': utils.batch_to_list(x, lig_mask),
+             'atom_types': utils.batch_to_list(ligand_type, lig_mask),
+             'bonds': utils.batch_to_list_for_indices(lig_bonds, lig_edge_mask, offsets),
+             'bond_types': utils.batch_to_list(edge_type, lig_edge_mask)
+         }
+         if self.predict_confidence:
+             sigma_x = out_tensors_ligand['sigma_x'].detach().cpu()
+             entropy_h = out_tensors_ligand['entropy_h'].detach().cpu()
+             mol_kwargs['atom_props'] = [
+                 {'sigma_x': x[0], 'entropy_h': x[1]}
+                 for x in zip(utils.batch_to_list(sigma_x, lig_mask),
+                              utils.batch_to_list(entropy_h, lig_mask))
+             ]
+         mol_kwargs = [{k: v[i] for k, v in mol_kwargs.items()}
+                       for i in range(len(mol_kwargs['coords']))]
+
+         # Convert into rdmols
+         rdmols = [build_molecule(
+             **m, atom_decoder=self.atom_decoder, bond_decoder=self.bond_decoder)
+             for m in mol_kwargs
+         ]
+
+         out_pocket = pocket.copy()
+         out_pocket['x'] = out_tensors_pocket['x']
+         out_pocket['v'] = out_tensors_pocket['v']
+         rdpockets = pocket_to_rdkit(out_pocket, self.pocket_representation,
+                                     self.atom_encoder, self.atom_decoder,
+                                     self.aa_decoder, self.residue_decoder,
+                                     self.aa_atom_index)
+
+         return rdmols, rdpockets, _ligand['name']
+
+     @torch.no_grad()
+     def sample_chain(self, pocket, keep_frames, num_nodes=None, timesteps=None,
+                      guide_log_prob=None, **kwargs):
+
+         # TODO: move somewhere else (like collate_fn)
+         pocket = Residues(**pocket)
+
+         info = {}
+
+         timesteps = self.T_sampling if timesteps is None else timesteps
+
+         # n_samples = 1
+         # TODO: get batch_size differently
+         assert len(pocket['mask'].unique()) <= 1, "sample_chain only supports a single sample"
+
+         # # Pocket's initial center of mass
+         # pocket_com_before = scatter_mean(pocket['x'], pocket['mask'], dim=0)
+
+         num_nodes = self.parse_num_nodes_spec(batch={"pocket": pocket}, spec=num_nodes)
+
+         # Sample from prior
+         if pocket['x'].numel() > 0:
+             ligand = self.init_ligand(num_nodes, pocket)
+         else:
+             dummy_pocket = Residues.empty(pocket['x'].device)
+             ligand = self.init_ligand(num_nodes, dummy_pocket)
+
+         pocket = self.init_pocket(pocket)
+
+         out_tensors_ligand, out_tensors_pocket = self.simulate(
+             ligand, pocket, timesteps, 0.0, 1.0, guide_log_prob=guide_log_prob, return_frames=keep_frames)
+
+         # chain_lig = utils.reverse_tensor(chain_lig)
+         # chain_pocket = utils.reverse_tensor(chain_pocket)
+         # chain_bond = utils.reverse_tensor(chain_bond)
+
+         info['traj_displacement_lig'] = torch.norm(out_tensors_ligand['x'][-1] - out_tensors_ligand['x'][0], dim=-1).mean()
+         info['traj_rms_lig'] = out_tensors_ligand['x'].std(dim=0).mean()
+
+         # # Repeat last frame to see final sample better.
+         # chain_lig = torch.cat([chain_lig, chain_lig[-1:].repeat(10, 1, 1)], dim=0)
+         # chain_pocket = torch.cat([chain_pocket, chain_pocket[-1:].repeat(10, 1, 1)], dim=0)
+         # chain_bond = torch.cat([chain_bond, chain_bond[-1:].repeat(10, 1, 1)], dim=0)
+
+         # Flatten
+         assert keep_frames == out_tensors_ligand['x'].size(0) == out_tensors_pocket['x'].size(0)
+         n_atoms = out_tensors_ligand['x'].size(1)
+         n_bonds = out_tensors_ligand['e'].size(1)
+         n_residues = out_tensors_pocket['x'].size(1)
+         device = out_tensors_ligand['x'].device
+
+         def flatten_tensor(chain):
+             if len(chain.size()) == 3:  # l=0 values
+                 return chain.view(-1, chain.size(-1))
+             elif len(chain.size()) == 4:  # vectors
+                 return chain.view(-1, chain.size(-2), chain.size(-1))
+             else:
+                 warnings.warn(f"Could not flatten frame dimension of tensor with shape {list(chain.size())}")
+                 return chain
+
+         out_tensors_ligand_flat = {k: flatten_tensor(chain) for k, chain in out_tensors_ligand.items()}
+         out_tensors_pocket_flat = {k: flatten_tensor(chain) for k, chain in out_tensors_pocket.items()}
+         # ligand_flat = chain_lig.view(-1, chain_lig.size(-1))
+         # ligand_mask_flat = torch.arange(chain_lig.size(0)).repeat_interleave(chain_lig.size(1)).to(chain_lig.device)
+         ligand_mask_flat = torch.arange(keep_frames).repeat_interleave(n_atoms).to(device)
+
+         # # pocket_flat = chain_pocket.view(-1, chain_pocket.size(-1))
+         # # pocket_v_flat = pocket['v'].repeat(100, 1, 1)
+         # pocket_flat = chain_pocket.view(-1, chain_pocket.size(-2), chain_pocket.size(-1))
+         # pocket_mask_flat = torch.arange(chain_pocket.size(0)).repeat_interleave(chain_pocket.size(1)).to(chain_pocket.device)
+         pocket_mask_flat = torch.arange(keep_frames).repeat_interleave(n_residues).to(device)
+
+         # bond_flat = chain_bond.view(-1, chain_bond.size(-1))
+         # bond_mask_flat = torch.arange(chain_bond.size(0)).repeat_interleave(chain_bond.size(1)).to(chain_bond.device)
+         bond_mask_flat = torch.arange(keep_frames).repeat_interleave(n_bonds).to(device)
+         edges_flat = ligand['bonds'].repeat(1, keep_frames)
+
+         # # Move generated molecule back to the original pocket position
+         # pocket_com_after = scatter_mean(pocket_flat[:, 0, :], pocket_mask_flat, dim=0)
+         # ligand_flat[:, :self.x_dim] += (pocket_com_before - pocket_com_after)[ligand_mask_flat]
+         #
+         # # Move pocket back as well (for visualization purposes)
+         # pocket_flat[:, 0, :] += (pocket_com_before - pocket_com_after)[pocket_mask_flat]
+
+         # Build ligands
+         x = out_tensors_ligand_flat['x'].detach().cpu()
+         ligand_type = out_tensors_ligand_flat['h'].argmax(1).detach().cpu()
+         ligand_mask_flat = ligand_mask_flat.detach().cpu()
+         bond_mask_flat = bond_mask_flat.detach().cpu()
+         edges_flat = edges_flat.detach().cpu()
+         edge_type = out_tensors_ligand_flat['e'].argmax(1).detach().cpu()
+         offsets = torch.zeros(keep_frames, dtype=int)  # edges_flat is already zero-based
+         molecules = list(
+             zip(utils.batch_to_list(x, ligand_mask_flat),
+                 utils.batch_to_list(ligand_type, ligand_mask_flat),
+                 utils.batch_to_list_for_indices(edges_flat, bond_mask_flat, offsets),
+                 utils.batch_to_list(edge_type, bond_mask_flat)
+                 )
+         )
+
+         # Convert into rdmols
+         ligand_chain = [build_molecule(
+             *graph, atom_decoder=self.atom_decoder,
+             bond_decoder=self.bond_decoder) for graph in molecules
+         ]
+
+         # Build pockets
+         # as long as the pocket does not change during sampling, we can just
+         # write it once
+         out_pocket = {
+             'x': out_tensors_pocket_flat['x'],
+             'one_hot': pocket['one_hot'].repeat(keep_frames, 1),
+             'mask': pocket_mask_flat,
+             'v': out_tensors_pocket_flat['v'],
+             'atom_mask': pocket['atom_mask'].repeat(keep_frames, 1),
+         } if self.flexible else pocket
+         pocket_chain = pocket_to_rdkit(out_pocket, self.pocket_representation,
+                                        self.atom_encoder, self.atom_decoder,
+                                        self.aa_decoder, self.residue_decoder,
+                                        self.aa_atom_index)
+
+         return ligand_chain, pocket_chain, info
+
+     # def configure_gradient_clipping(self, optimizer, optimizer_idx, gradient_clip_val, gradient_clip_algorithm):
+     # def configure_gradient_clipping(self, optimizer, gradient_clip_val, gradient_clip_algorithm):
+     def configure_gradient_clipping(self, optimizer, *args, **kwargs):
+
+         if not self.clip_grad:
+             return
+
+         # Allow gradient norm to be 150% + 2 * stdev of the recent history.
+         max_grad_norm = 1.5 * self.gradnorm_queue.mean() + \
+                         2 * self.gradnorm_queue.std()
+
+         # hard upper limit
+         max_grad_norm = min(max_grad_norm, 10.0)
+
+         # Get current grad_norm
+         params = [p for g in optimizer.param_groups for p in g['params']]
+         grad_norm = utils.get_grad_norm(params)
+
+         # Lightning will handle the gradient clipping
+         self.clip_gradients(optimizer, gradient_clip_val=max_grad_norm,
+                             gradient_clip_algorithm='norm')
+
+         if float(grad_norm) > max_grad_norm:
+             print(f'Clipped gradient with value {grad_norm:.1f} '
+                   f'while allowed {max_grad_norm:.1f}')
+             grad_norm = max_grad_norm
+
+         self.gradnorm_queue.add(float(grad_norm))
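A minimal sketch of the adaptive rule in `configure_gradient_clipping`: the allowed norm tracks the recent history (mean plus spread) with a hard cap. The `Queue` below is only a stand-in for the model's `gradnorm_queue` (assumed interface: `mean()`, `std()`, `add()`), not the repository's implementation.

from collections import deque
import statistics

class Queue:
    def __init__(self, maxlen=50):
        self.items = deque(maxlen=maxlen)
    def add(self, x): self.items.append(x)
    def mean(self): return statistics.fmean(self.items)
    def std(self): return statistics.pstdev(self.items)

q = Queue()
for g in [1.0, 1.2, 0.9, 5.0]:                    # incoming gradient norms
    if len(q.items) >= 2:
        allowed = min(1.5 * q.mean() + 2 * q.std(), 10.0)
        g = min(g, allowed)                       # clip, then record the clipped value
    q.add(g)
print(round(q.mean(), 3))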
src/model/loss_utils.py ADDED
@@ -0,0 +1,79 @@
+ import torch
+ from torch_scatter import scatter_add, scatter_mean
+
+ from src.constants import atom_decoder, vdw_radii
+
+ _vdw_radii = {**vdw_radii}
+ _vdw_radii['NH'] = vdw_radii['N']
+ _vdw_radii['N+'] = vdw_radii['N']
+ _vdw_radii['O-'] = vdw_radii['O']
+ _vdw_radii['NOATOM'] = 0
+ vdw_radii_array = torch.tensor([_vdw_radii[a] for a in atom_decoder])
+
+
+ def clash_loss(ligand_coord, ligand_types, ligand_mask, pocket_coord,
+                pocket_types, pocket_mask):
+     """
+     Computes a clash loss that penalizes interatomic distances smaller than
+     the sum of the atoms' van der Waals radii.
+     """
+
+     ligand_radii = vdw_radii_array[ligand_types].to(ligand_coord.device)
+     pocket_radii = vdw_radii_array[pocket_types].to(pocket_coord.device)
+
+     dist = torch.sqrt(torch.sum((ligand_coord[:, None, :] - pocket_coord[None, :, :]) ** 2, dim=-1))
+     # dist[ligand_mask[:, None] != pocket_mask[None, :]] = float('inf')
+
+     # compute linearly decreasing penalty
+     # penalty = max(1 - d / sum_vdw, 0)
+     sum_vdw = ligand_radii[:, None] + pocket_radii[None, :]
+     loss = torch.clamp(1 - dist / sum_vdw, min=0.0)  # (n_ligand, n_pocket)
+
+     loss = scatter_add(loss, pocket_mask, dim=1)
+     loss = scatter_mean(loss, ligand_mask, dim=0)
+     loss = loss.diag()
+
+     # # DEBUG (non-differentiable version)
+     # dist = torch.sqrt(torch.sum((ligand_coord[:, None, :] - pocket_coord[None, :, :]) ** 2, dim=-1))
+     # dist[ligand_mask[:, None] != pocket_mask[None, :]] = float('inf')
+     # _loss = torch.clamp(1 - dist / sum_vdw, min=0.0)  # (n_ligand, n_pocket)
+     # _loss = _loss.sum(dim=-1)
+     # _loss = scatter_mean(_loss, ligand_mask, dim=0)
+     # assert torch.allclose(loss, _loss)
+
+     return loss
+
+
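A hedged usage sketch for `clash_loss` with toy coordinates: one ligand atom overlaps a pocket atom's van der Waals sphere, the other is far away, so only the first pair is penalized. It assumes 'C' is among the atom types in `atom_decoder`; the mask tensors put everything into a single graph.

import torch

c = list(atom_decoder).index('C')
ligand_coord = torch.tensor([[0.0, 0.0, 0.0], [9.0, 0.0, 0.0]])
ligand_types = torch.tensor([c, c])
ligand_mask = torch.tensor([0, 0])                # both atoms in graph 0
pocket_coord = torch.tensor([[0.5, 0.0, 0.0]])
pocket_types = torch.tensor([c])
pocket_mask = torch.tensor([0])

loss = clash_loss(ligand_coord, ligand_types, ligand_mask,
                  pocket_coord, pocket_types, pocket_mask)
print(loss)                                       # one value per graph, > 0 here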
+ class TimestepSampler:
+     def __init__(self, type='uniform', lowest_t=1, highest_t=500):
+         assert type in {'uniform', 'sigmoid'}
+         self.type = type
+         self.lowest_t = lowest_t
+         self.highest_t = highest_t
+
+     def __call__(self, n, device=None):
+         if self.type == 'uniform':
+             t_int = torch.randint(self.lowest_t, self.highest_t + 1,
+                                   size=(n, 1), device=device)
+
+         elif self.type == 'sigmoid':
+             weight_fun = lambda t: 1.45 * torch.sigmoid(-t * 10 / self.highest_t + 5) + 0.05
+
+             possible_ts = torch.arange(self.lowest_t, self.highest_t + 1, device=device)
+             weights = weight_fun(possible_ts)
+             weights = weights / weights.sum()
+             t_int = possible_ts[torch.multinomial(weights, n, replacement=True)].unsqueeze(-1)
+
+         return t_int.float()
+
+
+ class TimestepWeights:
+     def __init__(self, weight_type, a, b):
+         if weight_type != 'sigmoid':
+             raise NotImplementedError("Only sigmoidal loss weighting is available.")
+         # self.weight_fn = lambda t: a * torch.sigmoid((-t + 0.5) * b) + (1 - a / 2)
+         self.weight_fn = lambda t: a * torch.sigmoid((t - 0.5) * b) + (1 - a / 2)
+
+     def __call__(self, t_array):
+         # normalized t \in [0, 1]
+         # return self.weight_fn(1 - t_array)
+         return self.weight_fn(t_array)
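A numeric sanity check for `TimestepWeights`: with the sigmoidal weighting, w(t) increases monotonically in t and is centred so that w(0.5) = 1. The values of a and b below are arbitrary.

import torch

w = TimestepWeights('sigmoid', a=1.0, b=10.0)
t = torch.tensor([0.0, 0.5, 1.0])
print(w(t))  # approx. [0.507, 1.000, 1.493] for a=1, b=10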
src/model/markov_bridge.py ADDED
@@ -0,0 +1,163 @@
+ import torch
+ import torch.nn.functional as F
+ from torch_scatter import scatter_mean, scatter_add
+
+ from src.utils import bvm
+
+
+ class LinearSchedule:
+     r"""
+     We use the scheduling parameter \beta to linearly remove noise, i.e.
+     \bar{\beta}_t = 1 - t with
+         \bar{Q}_t = \bar{\beta}_t I + (1 - \bar{\beta}_t) 1_vec z1^T
+
+     From this, it follows that each one-step transition matrix satisfies
+     \beta_t = \bar{\beta}_t / \bar{\beta}_{t-h} = \frac{1-t}{1-t+h} (h: step size)
+     """
+     def __init__(self):
+         super().__init__()
+
+     def beta_bar(self, t):
+         return 1 - t
+
+     def beta(self, t, step_size):
+         return (1 - t) / (1 - t + step_size)
+
+
+ class UniformPriorMarkovBridge:
+     r"""
+     Markov bridge model in which z0 is drawn from a uniform prior.
+     Transitions are defined as:
+         Q_t = \beta_t I + (1 - \beta_t) 1_vec z1^T
+     where z1 is a one-hot representation of the final state.
+     We follow the notation from [1] and multiply transition matrices from the
+     right to one-hot state vectors.
+
+     We use the scheduling parameter \beta to linearly remove noise, i.e.
+     \bar{\beta}_t = 1 - t with
+         \bar{Q}_t = \bar{\beta}_t I + (1 - \bar{\beta}_t) 1_vec z1^T
+
+     From this, it follows that each one-step transition matrix satisfies
+     \beta_t = \bar{\beta}_t / \bar{\beta}_{t-h} = \frac{1-t}{1-t+h} (h: step size)
+
+     [1] Austin, Jacob, et al.
+     "Structured denoising diffusion models in discrete state-spaces."
+     Advances in Neural Information Processing Systems 34 (2021): 17981-17993.
+     """
+     def __init__(self, dim, loss_type='CE', step_size=None):
+         assert loss_type in ['VLB', 'CE']
+         self.dim = dim
+         self.step_size = step_size  # required for VLB
+         self.schedule = LinearSchedule()
+         self.loss_type = loss_type
+         super(UniformPriorMarkovBridge, self).__init__()
+
+     @staticmethod
+     def sample_categorical(p):
+         """
+         Sample from the categorical distribution defined by probabilities 'p'
+         :param p: (n, dim)
+         :return: one-hot encoded samples (n, dim)
+         """
+         sampled = torch.multinomial(p, 1).squeeze(-1)
+         return F.one_hot(sampled, num_classes=p.size(1)).float()
+
+     def p_z0(self, batch_mask):
+         return torch.ones((len(batch_mask), self.dim), device=batch_mask.device) / self.dim
+
+     def sample_z0(self, batch_mask):
+         """ Prior. """
+         z0 = self.sample_categorical(self.p_z0(batch_mask))
+         return z0
+
+     def p_zt(self, z0, z1, t, batch_mask):
+         Qt_bar = self.get_Qt_bar(t, z1, batch_mask)
+         return bvm(z0, Qt_bar)
+
+     def sample_zt(self, z0, z1, t, batch_mask):
+         zt = self.sample_categorical(self.p_zt(z0, z1, t, batch_mask))
+         return zt
+
+     def p_zt_given_zs_and_z1(self, zs, z1, s, t, batch_mask):
+         # 'z1' are one-hot "probabilities" for each class
+         Qt = self.get_Qt(t, s, z1, batch_mask)
+         p_zt = bvm(zs, Qt)
+         return p_zt
+
+     def p_zt_given_zs(self, zs, p_z1_hat, s, t, batch_mask):
+         r"""
+         Note that the final state z1 can also represent a categorical
+         distribution to compute transitions more efficiently at sampling time:
+         p(z_t|z_s) = \sum_{\hat{z}_1} p(z_t | z_s, \hat{z}_1) * p(\hat{z}_1 | z_s)
+                    = \sum_i z_s (\beta_t I + (1 - \beta_t) 1_vec z1_i^T) \hat{p}_i
+                    = \beta_t z_s I + (1 - \beta_t) z_s 1_vec \hat{p}^T
+         """
+         return self.p_zt_given_zs_and_z1(zs, p_z1_hat, s, t, batch_mask)
+
+     def sample_zt_given_zs(self, zs, z1_logits, s, t, batch_mask):
+         p_z1 = z1_logits.softmax(dim=-1)
+         zt = self.sample_categorical(self.p_zt_given_zs(zs, p_z1, s, t, batch_mask))
+         return zt
+
+     def compute_loss(self, pred_logits, zs, z1, batch_mask, s, t, reduce='mean'):
+         """ Compute loss per sample. """
+         assert reduce in {'mean', 'sum', 'none'}
+
+         if self.loss_type == 'CE':
+             loss = F.cross_entropy(pred_logits, z1, reduction='none')
+
+         else:  # VLB
+             true_p_zs = self.p_zt_given_zs_and_z1(zs, z1, s, t, batch_mask)
+             pred_p_zs = self.p_zt_given_zs(zs, pred_logits.softmax(dim=-1), s, t, batch_mask)
+             loss = F.kl_div(pred_p_zs.log(), true_p_zs, reduction='none').sum(dim=-1)
+
+         if reduce == 'mean':
+             loss = scatter_mean(loss, batch_mask, dim=0)
+         elif reduce == 'sum':
+             loss = scatter_add(loss, batch_mask, dim=0)
+
+         return loss
+
+     def get_Qt(self, t, s, z1, batch_mask):
+         """ Returns the one-step transition matrix from step s to step t. """
+
+         beta_t_given_s = self.schedule.beta(t, t - s)
+         beta_t_given_s = beta_t_given_s.unsqueeze(-1)[batch_mask]
+
+         # Q_t = beta_t * I + (1 - beta_t) * ones (dot) z1^T
+         Qt = beta_t_given_s * torch.eye(self.dim, device=t.device).unsqueeze(0) + \
+              (1 - beta_t_given_s) * z1.unsqueeze(1)
+         # (1 - beta_t_given_s) * (torch.ones(self.dim, 1, device=t.device) @ z1)
+
+         # assert (Qt.sum(-1) == 1).all()
+
+         return Qt
+
+     def get_Qt_bar(self, t, z1, batch_mask):
+         """ Returns the transition matrix from step 0 to step t. """
+
+         beta_bar_t = self.schedule.beta_bar(t)
+         beta_bar_t = beta_bar_t.unsqueeze(-1)[batch_mask]
+
+         # Q_t_bar = beta_bar * I + (1 - beta_bar) * ones (dot) z1^T
+         Qt_bar = beta_bar_t * torch.eye(self.dim, device=t.device).unsqueeze(0) + \
+                  (1 - beta_bar_t) * z1.unsqueeze(1)
+         # (1 - beta_bar_t) * (torch.ones(self.dim, 1, device=t.device) @ z1)
+
+         # assert (Qt_bar.sum(-1) == 1).all()
+
+         return Qt_bar
+
+
+ class MarginalPriorMarkovBridge(UniformPriorMarkovBridge):
+     def __init__(self, dim, prior_p, loss_type='CE', step_size=None):
+         self.prior_p = prior_p
+         print('Marginal Prior MB')
+         super(MarginalPriorMarkovBridge, self).__init__(dim, loss_type, step_size)
+
+     def p_z0(self, batch_mask):
+         device = batch_mask.device
+         p = torch.ones((len(batch_mask), self.dim), device=device) * self.prior_p.view(1, -1).to(device)
+         return p
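A small end-to-end sketch of the uniform-prior bridge on dummy one-hot data: z0 is uniform noise, zt interpolates towards the target z1, and every row of the cumulative transition matrix sums to 1 (the assert that is commented out above). `bvm`, used internally, is assumed to be the repository's batched vector-matrix product.

import torch
import torch.nn.functional as F

dim, n = 4, 6
mb = UniformPriorMarkovBridge(dim=dim)
batch_mask = torch.zeros(n, dtype=torch.long)     # a single graph
z1 = F.one_hot(torch.randint(dim, (n,)), dim).float()
t = torch.full((1, 1), 0.7)

z0 = mb.sample_z0(batch_mask)
Qt_bar = mb.get_Qt_bar(t, z1, batch_mask)
assert torch.allclose(Qt_bar.sum(-1), torch.ones(n, dim))
zt = mb.sample_zt(z0, z1, t, batch_mask)
print(zt.shape)                                   # torch.Size([6, 4])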
src/sample_and_evaluate.py ADDED
@@ -0,0 +1,164 @@
+ import argparse
+ import sys
+ import yaml
+ import torch
+ import numpy as np
+ import pickle
+ from argparse import Namespace
+
+ from pathlib import Path
+
+ basedir = Path(__file__).resolve().parent.parent
+ sys.path.append(str(basedir))
+
+ from src import utils
+ from src.utils import dict_to_namespace, namespace_to_dict
+ from src.analysis.visualization_utils import mols_to_pdbfile, mol_as_pdb
+ from src.data.data_utils import TensorDict, Residues
+ from src.data.postprocessing import process_all
+ from src.model.lightning import DrugFlow
+ from src.sbdd_metrics.evaluation import compute_all_metrics_drugflow
+
+ from tqdm import tqdm
+
+
+ def combine(base_args, override_args):
+     assert not isinstance(base_args, dict)
+     assert not isinstance(override_args, dict)
+
+     arg_dict = base_args.__dict__
+     for key, value in override_args.__dict__.items():
+         if key not in arg_dict or arg_dict[key] is None:  # parameter not provided previously
+             print(f"Add parameter {key}: {value}")
+             arg_dict[key] = value
+         elif isinstance(value, Namespace):
+             arg_dict[key] = combine(arg_dict[key], value)
+         else:
+             print(f"Replace parameter {key}: {arg_dict[key]} -> {value}")
+             arg_dict[key] = value
+     return base_args
+
+
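A hedged illustration of how `combine` merges a checkpoint's stored arguments with overrides from the sampling config: nested Namespaces are merged recursively, scalars are replaced, and unknown keys are added. All values below are made up.

from argparse import Namespace

base = Namespace(n_steps=100, sampling=Namespace(temperature=1.0))
override = Namespace(n_steps=500, sampling=Namespace(temperature=0.7), device='cuda')
merged = combine(base, override)
print(merged.n_steps, merged.sampling.temperature, merged.device)  # 500 0.7 cuda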
+ def path_to_str(input_dict):
+     for key, value in input_dict.items():
+         if isinstance(value, dict):
+             input_dict[key] = path_to_str(value)
+         else:
+             input_dict[key] = str(value) if isinstance(value, Path) else value
+     return input_dict
+
+
+ def sample(cfg, model_params, samples_dir, job_id=0, n_jobs=1):
+     print('Sampling...')
+     model = DrugFlow.load_from_checkpoint(cfg.checkpoint, map_location=cfg.device, strict=False,
+                                           **model_params)
+     model.setup(stage='fit' if cfg.set == 'train' else cfg.set)
+     model.eval().to(cfg.device)
+
+     dataloader = getattr(model, f'{cfg.set}_dataloader')()
+     print(f'Real batch size is {dataloader.batch_size * cfg.n_samples}')
+
+     name2count = {}
+     for i, data in enumerate(tqdm(dataloader)):
+         if i % n_jobs != job_id:
+             print(f'Skipping batch {i}')
+             continue
+
+         new_data = {
+             'ligand': TensorDict(**data['ligand']).to(cfg.device),
+             'pocket': Residues(**data['pocket']).to(cfg.device),
+         }
+         try:
+             rdmols, rdpockets, names = model.sample(
+                 data=new_data,
+                 n_samples=cfg.n_samples,
+                 num_nodes=("ground_truth" if cfg.sample_with_ground_truth_size else None)
+             )
+         except Exception as e:
+             if cfg.set == 'train':
+                 names = data['ligand']['name']
+                 print(f'Failed to sample for {names}: {e}')
+                 continue
+             else:
+                 raise e
+
+         for mol, pocket, name in zip(rdmols, rdpockets, names):
+             name = name.replace('.sdf', '')
+             idx = name2count.setdefault(name, 0)
+             output_dir = Path(samples_dir, name)
+             output_dir.mkdir(parents=True, exist_ok=True)
+             if cfg.postprocess:
+                 mol = process_all(mol, largest_frag=True, adjust_aromatic_Ns=True, relax_iter=0)
+                 if mol is None:  # post-processing can fail; skip the sample but keep numbering
+                     print(f'Post-processing failed for sample {idx} of {name}')
+                     name2count[name] += 1
+                     continue
+
+             for prop in mol.GetAtomWithIdx(0).GetPropsAsDict().keys():
+                 # compute avg uncertainty
+                 mol.SetDoubleProp(prop, np.mean([a.GetDoubleProp(prop) for a in mol.GetAtoms()]))
+
+                 # visualise local differences
+                 out_pdb_path = Path(output_dir, f'{idx}_ligand_{prop}.pdb')
+                 mol_as_pdb(mol, out_pdb_path, bfactor=prop)
+
+             out_sdf_path = Path(output_dir, f'{idx}_ligand.sdf')
+             out_pdb_path = Path(output_dir, f'{idx}_pocket.pdb')
+             utils.write_sdf_file(out_sdf_path, [mol])
+             mols_to_pdbfile([pocket], out_pdb_path)
+
+             name2count[name] += 1
+
+
+ def evaluate(cfg, model_params, samples_dir):
+     print('Evaluation...')
+     data, table_detailed, table_aggregated = compute_all_metrics_drugflow(
+         in_dir=samples_dir,
+         gnina_path=model_params['train_params'].gnina,
+         reduce_path=cfg.reduce,
+         reference_smiles_path=Path(model_params['train_params'].datadir, 'train_smiles.npy'),
+         n_samples=cfg.n_samples,
+         exclude_evaluators=[] if cfg.exclude_evaluators is None else cfg.exclude_evaluators,
+     )
+     with open(Path(samples_dir, 'metrics_data.pkl'), 'wb') as f:
+         pickle.dump(data, f)
+     table_detailed.to_csv(Path(samples_dir, 'metrics_detailed.csv'), index=False)
+     table_aggregated.to_csv(Path(samples_dir, 'metrics_aggregated.csv'), index=False)
+
+
+ if __name__ == "__main__":
+     p = argparse.ArgumentParser()
+     p.add_argument('--config', type=str)
+     p.add_argument('--job_id', type=int, default=0, help='Job ID')
+     p.add_argument('--n_jobs', type=int, default=1, help='Number of jobs')
+     args = p.parse_args()
+
+     with open(args.config, 'r') as f:
+         cfg = yaml.safe_load(f)
+     cfg = dict_to_namespace(cfg)
+
+     utils.set_deterministic(seed=cfg.seed)
+     utils.disable_rdkit_logging()
+
+     model_params = torch.load(cfg.checkpoint, map_location=cfg.device)['hyper_parameters']
+     if 'model_args' in cfg:
+         ckpt_args = dict_to_namespace(model_params)
+         model_params = combine(ckpt_args, cfg.model_args).__dict__
+
+     assert cfg.set in {'val', 'test', 'train'}
+     ckpt_path = Path(cfg.checkpoint)
+     ckpt_name = ckpt_path.parts[-1].split('.')[0]
+     n_steps = model_params['simulation_params'].n_steps
+     # `Path(...) or Path(...)` always picks the first path (a Path is truthy),
+     # so fall back to the checkpoint directory explicitly instead
+     if getattr(cfg, 'sample_outdir', None) is not None:
+         samples_dir = Path(cfg.sample_outdir, cfg.set, f'{ckpt_name}_T={n_steps}')
+     else:
+         samples_dir = Path(ckpt_path.parent.parent, 'samples', cfg.set, f'{ckpt_name}_T={n_steps}')
+     samples_dir.mkdir(parents=True, exist_ok=True)
+
+     # save configs
+     with open(Path(samples_dir, 'model_params.yaml'), 'w') as f:
+         yaml.dump(path_to_str(namespace_to_dict(model_params)), f)
+     with open(Path(samples_dir, 'sampling_params.yaml'), 'w') as f:
+         yaml.dump(path_to_str(namespace_to_dict(cfg)), f)
+
+     if cfg.sample:
+         sample(cfg, model_params, samples_dir, job_id=args.job_id, n_jobs=args.n_jobs)
+
+     if cfg.evaluate:
+         assert args.job_id == 0 and args.n_jobs == 1, 'Evaluation is not parallelised on GPU machines'
+         evaluate(cfg, model_params, samples_dir)
src/sbdd_metrics/evaluation.py ADDED
@@ -0,0 +1,239 @@
+ import os
+ import sys
+ import re
+
+ from pathlib import Path
+ from typing import Collection, List, Dict, Type
+
+ import numpy as np
+ import pandas as pd
+ from tqdm import tqdm
+
+ from .metrics import FullEvaluator, FullCollectionEvaluator
+
+ AUXILIARY_COLUMNS = ['sample', 'sdf_file', 'pdb_file', 'subdir']
+ VALIDITY_METRIC_NAME = 'medchem.valid'
+
+
+ def get_data_type(key: str, data_types: Dict[str, Type], default=float) -> Type:
+     found_data_type_key = None
+     found_data_type_value = None
+     for data_type_key, data_type_value in data_types.items():
+         if re.match(data_type_key, key) is not None:
+             if found_data_type_key is not None:
+                 raise ValueError(f'Multiple data type keys match [{key}]: {found_data_type_key}, {data_type_key}')
+
+             found_data_type_value = data_type_value
+             found_data_type_key = data_type_key
+
+     if found_data_type_key is None:
+         if default is None:
+             raise KeyError(key)
+         else:
+             found_data_type_value = default
+
+     return found_data_type_value
+
+
+ def convert_data_to_table(data: List[Dict], data_types: Dict[str, Type]) -> pd.DataFrame:
+     """
+     Converts data from `evaluate_drugflow` to a detailed table
+     """
+     table = []
+     for entry in data:
+         table_entry = {}
+         for key, value in entry.items():
+             if key in AUXILIARY_COLUMNS:
+                 table_entry[key] = value
+                 continue
+             if get_data_type(key, data_types) != list:
+                 table_entry[key] = value
+         table.append(table_entry)
+
+     return pd.DataFrame(table)
+
+ def aggregated_metrics(table: pd.DataFrame, data_types: Dict[str, Type], validity_metric_name: str = None):
+     """
+     Args:
+         table (pd.DataFrame): table with metrics computed for each sample
+         data_types (Dict[str, Type]): dictionary with data types for each column
+         validity_metric_name (str): name of the column that has the validity metric
+
+     Returns:
+         agg_table (pd.DataFrame): table with columns ['metric', 'value', 'std']
+     """
+     aggregated_results = []
+
+     # If a validity column name is provided:
+     # 1. compute validity on the entire data
+     # 2. drop all invalid molecules to compute the rest
+     if validity_metric_name is not None:
+         aggregated_results.append({
+             'metric': validity_metric_name,
+             'value': table[validity_metric_name].fillna(False).astype(float).mean(),
+             'std': None,
+         })
+         table = table[table[validity_metric_name]]
+
+     # Compute aggregated metrics + standard deviations where applicable
+     for column in table.columns:
+         if column in AUXILIARY_COLUMNS + [validity_metric_name] or get_data_type(column, data_types) == str:
+             continue
+         with pd.option_context("future.no_silent_downcasting", True):
+             if get_data_type(column, data_types) == bool:
+                 values = table[column].fillna(0).values.astype(float).mean()
+                 std = None
+             else:
+                 values = table[column].dropna().values.astype(float).mean()
+                 std = table[column].dropna().values.astype(float).std()
+
+         aggregated_results.append({
+             'metric': column,
+             'value': values,
+             'std': std,
+         })
+
+     agg_table = pd.DataFrame(aggregated_results)
+     return agg_table
+
+
+ def collection_metrics(
101
+ table: pd.DataFrame,
102
+ reference_smiles: Collection[str],
103
+ validity_metric_name: str = None,
104
+ exclude_evaluators: Collection[str] = [],
105
+ ):
106
+ """
107
+ Args:
108
+ table (pd.DataFrame): table with metrics computed for each sample
109
+ reference_smiles (Collection[str]): list of reference SMILES (e.g. training set)
110
+ validity_metric_name (str): name of the column that has validity metric
111
+ exclude_evaluators (Collection[str]): Evaluator IDs to exclude
112
+
113
+ Returns:
114
+ col_table (pd.DataFrame): table with columns ['metric', 'value']
115
+ """
116
+
117
+ # If validity column name is provided drop all invalid molecules
118
+ if validity_metric_name is not None:
119
+ table = table[table[validity_metric_name].fillna(False).astype(bool)]
120
+
121
+ evaluator = FullCollectionEvaluator(reference_smiles, exclude_evaluators=exclude_evaluators)
122
+ smiles = table['representation.smiles'].values
123
+ if len(smiles) == 0:
124
+ print('No valid input molecules')
125
+ return pd.DataFrame(columns=['metric', 'value'])
126
+
127
+ collection_results = evaluator(smiles)
128
+ results = [
129
+ {'metric': key, 'value': value}
130
+ for key, value in collection_results.items()
131
+ ]
132
+
133
+ col_table = pd.DataFrame(results)
134
+ return col_table
135
+
136
+
137
+ def evaluate_drugflow_subdir(
138
+ in_dir: Path,
139
+ evaluator: FullEvaluator,
140
+ desc: str = None,
141
+ n_samples: int = None,
142
+ ) -> List[Dict]:
143
+ """
144
+ Computes per-molecule metrics for a single directory of samples for one target
145
+ """
146
+ results = []
147
+ valid_files = [
148
+ int(fname.split('_')[0])
149
+ for fname in os.listdir(in_dir)
150
+ if fname.endswith('_ligand.sdf') and not fname.startswith('.')
151
+ ]
152
+ if len(valid_files) == 0:
153
+ return []
154
+
155
+ upper_bound = max(valid_files) + 1
156
+ if n_samples is not None:
157
+ upper_bound = min(upper_bound, n_samples)
158
+
159
+ for i in tqdm(range(upper_bound), desc=desc, file=sys.stdout):
160
+ in_mol = Path(in_dir, f'{i}_ligand.sdf')
161
+ in_prot = Path(in_dir, f'{i}_pocket.pdb')
162
+ res = evaluator(in_mol, in_prot)
163
+
164
+ res['sample'] = i
165
+ res['sdf_file'] = str(in_mol)
166
+ res['pdb_file'] = str(in_prot)
167
+ results.append(res)
168
+
169
+ return results
170
+
171
+
172
+ def evaluate_drugflow(
173
+ in_dir: Path,
174
+ evaluator: FullEvaluator,
175
+ n_samples: int = None,
176
+ job_id: int = 0,
177
+ n_jobs: int = 1,
178
+ ) -> List[Dict]:
179
+ """
180
+ Computes per-molecule metrics for every per-target subdirectory of `in_dir`.
181
+ Subdirectories are sharded round-robin across parallel workers: this worker
182
+ only processes the i-th subdirectory if (i - 1) % n_jobs == job_id.
183
+ """
184
+ data = []
185
+ total_number_of_subdirs = len([path for path in in_dir.glob("[!.]*") if os.path.isdir(path)])
186
+ i = 0
187
+ for subdir in in_dir.glob("[!.]*"):
188
+ if not os.path.isdir(subdir):
189
+ continue
190
+
191
+ i += 1
192
+ if (i - 1) % n_jobs != job_id:
193
+ continue
194
+
195
+ curr_data = evaluate_drugflow_subdir(
196
+ in_dir=subdir,
197
+ evaluator=evaluator,
198
+ desc=f'[{i}/{total_number_of_subdirs}] {str(subdir.name)}',
199
+ n_samples=n_samples,
200
+ )
201
+ for entry in curr_data:
202
+ entry['subdir'] = str(subdir)
203
+ data.append(entry)
204
+
205
+ return data
206
+
207
+
208
+ def compute_all_metrics_drugflow(
209
+ in_dir: Path,
210
+ gnina_path: Path,
211
+ reduce_path: Path = None,
212
+ reference_smiles_path: Path = None,
213
+ n_samples: int = None,
214
+ validity_metric_name: str = VALIDITY_METRIC_NAME,
215
+ exclude_evaluators: Collection[str] = [],
216
+ job_id: int = 0,
217
+ n_jobs: int = 1,
218
+ ):
219
+ evaluator = FullEvaluator(gnina=gnina_path, reduce=reduce_path, exclude_evaluators=exclude_evaluators)
220
+ data = evaluate_drugflow(in_dir=in_dir, evaluator=evaluator, n_samples=n_samples, job_id=job_id, n_jobs=n_jobs)
221
+ table_detailed = convert_data_to_table(data, evaluator.dtypes)
222
+ table_aggregated = aggregated_metrics(
223
+ table_detailed,
224
+ data_types=evaluator.dtypes,
225
+ validity_metric_name=validity_metric_name
226
+ )
227
+
228
+ # Add collection metrics (uniqueness, novelty, FCD, etc.) if reference smiles are provided
229
+ if reference_smiles_path is not None:
230
+ reference_smiles = np.load(reference_smiles_path)
231
+ col_metrics = collection_metrics(
232
+ table=table_detailed,
233
+ reference_smiles=reference_smiles,
234
+ validity_metric_name=validity_metric_name,
235
+ exclude_evaluators=exclude_evaluators
236
+ )
237
+ table_aggregated = pd.concat([table_aggregated, col_metrics])
238
+
239
+ return data, table_detailed, table_aggregated
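
A minimal usage sketch of the pipeline above, assuming the module is imported as part of the package; all paths are placeholders, and `gnina_path`/`reduce_path` may be None, in which case the corresponding evaluators are skipped:

    from pathlib import Path
    from src.sbdd_metrics.evaluation import compute_all_metrics_drugflow

    data, table_detailed, table_aggregated = compute_all_metrics_drugflow(
        in_dir=Path('samples'),                          # one subdirectory per target
        gnina_path=Path('./gnina'),                      # docking scores (or None)
        reduce_path=Path('./reduce'),                    # interaction profiling (or None)
        reference_smiles_path=Path('train_smiles.npy'),  # enables uniqueness/novelty/FCD
        n_samples=100,
    )
    table_detailed.to_csv('metrics_detailed.csv', index=False)
    table_aggregated.to_csv('metrics_aggregated.csv', index=False)
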
src/sbdd_metrics/fpscores.pkl.gz ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10dcef9340c873e7b987924461b0af5365eb8dd96be607203debe8ddf80c1e73
3
+ size 3848394
src/sbdd_metrics/interactions.py ADDED
@@ -0,0 +1,231 @@
1
+ import prody
2
+ import prolif as plf
3
+ import pandas as pd
4
+ import subprocess
5
+
6
+ from io import StringIO
7
+ from prolif.fingerprint import Fingerprint
8
+ from prolif.plotting.complex3d import Complex3D
9
+ from prolif.residue import ResidueId
10
+ from prolif.ifp import IFP
11
+ from rdkit import Chem
12
+ from tqdm import tqdm
13
+
14
+
15
+ prody.confProDy(verbosity='none')
16
+
17
+
18
+ INTERACTION_LIST = [
19
+ 'Anionic', 'Cationic', # Salt Bridges ~400 kJ/mol
20
+ 'HBAcceptor', 'HBDonor', # Hydrogen bonds ~10 kJ/mol
21
+ 'XBAcceptor', 'XBDonor', # Halogen bonds ~5-30 kJ/mol
22
+ 'CationPi', 'PiCation', # 5-10 kJ/mol
23
+ 'PiStacking', # ~2-10 kJ/mol
24
+ 'Hydrophobic', # 1-10 kJ/mol
25
+ ]
26
+
27
+ INTERACTION_ALIASES = {
28
+ 'Anionic': 'SaltBridge',
29
+ 'Cationic': 'SaltBridge',
30
+ 'HBAcceptor': 'HBAcceptor',
31
+ 'HBDonor': 'HBDonor',
32
+ 'XBAcceptor': 'HalogenBond',
33
+ 'XBDonor': 'HalogenBond',
34
+ 'CationPi': 'CationPi',
35
+ 'PiCation': 'PiCation',
36
+ 'PiStacking': 'PiStacking',
37
+ 'Hydrophobic': 'Hydrophobic',
38
+ }
39
+
40
+ INTERACTION_COLORS = {
41
+ 'SaltBridge': '#eba823',
42
+ 'HBDonor': '#3d5dfc',
43
+ 'HBAcceptor': '#3d5dfc',
44
+ 'HalogenBond': '#53f514',
45
+ 'CationPi': '#ff0000',
46
+ 'PiCation': '#ff0000',
47
+ 'PiStacking': '#e359d8',
48
+ 'Hydrophobic': '#c9c5c5',
49
+ }
50
+
51
+ INTERACTION_IMPORTANCE = ['SaltBridge', 'HalogenBond', 'HBAcceptor', 'HBDonor', 'CationPi', 'PiCation', 'PiStacking', 'Hydrophobic']
52
+
53
+ REDUCE_EXEC = './reduce'
54
+
55
+ def remove_residue_by_atomic_number(structure, resnum, chain_id, icode):
56
+ exclude_selection = f'not (chain {chain_id} and resnum {resnum} and icode {icode})'
57
+ structure = structure.select(exclude_selection)
58
+ return structure
59
+
60
+
61
+ def read_protein(protein_path, verbose=False, reduce_exec=REDUCE_EXEC):
62
+ structure = prody.parsePDB(protein_path).select('protein')
63
+ hydrogens = structure.select('hydrogen')
64
+ if hydrogens is None or len(hydrogens) < len(set(structure.getResnums())):
65
+ if verbose:
66
+ print('Target structure is not protonated. Adding hydrogens...')
67
+
68
+ reduce_cmd = f'{str(reduce_exec)} {protein_path}'
69
+ reduce_result = subprocess.run(reduce_cmd, shell=True, capture_output=True, text=True)
70
+ if reduce_result.returncode != 0:
71
+ raise RuntimeError('Error during reduce execution:', reduce_result.stderr)
72
+
73
+ pdb_content = reduce_result.stdout
74
+ stream = StringIO()
75
+ stream.write(pdb_content)
76
+ stream.seek(0)
77
+ structure = prody.parsePDBStream(stream).select('protein')
78
+
79
+ # Select only one (largest) altloc
80
+ altlocs = set(structure.getAltlocs())
81
+ try:
82
+ best_altloc = max(altlocs, key=lambda a: structure.select(f'altloc "{a}"').numAtoms())
83
+ structure = structure.select(f'altloc "{best_altloc}"')
84
+ except TypeError:
85
+ # ProDy occasionally raises a spurious TypeError on the first selection attempt; retrying the identical selection succeeds
86
+ best_altloc = max(altlocs, key=lambda a: structure.select(f'altloc "{a}"').numAtoms())
87
+ structure = structure.select(f'altloc "{best_altloc}"')
88
+
89
+ return prepare_protein(structure, to_exclude=[], verbose=verbose)
90
+
91
+
92
+ def prepare_protein(input_structure, to_exclude=[], verbose=False):
93
+ structure = input_structure.copy()
94
+
95
+ # Remove residues with bad atoms
96
+ if verbose and len(to_exclude) > 0:
97
+ print(f'Removing {len(to_exclude)} residues...')
98
+ for resnum, chain_id, icode in to_exclude:
99
+ exclude_selection = f'not (chain {chain_id} and resnum {resnum})'
100
+ structure = structure.select(exclude_selection)
101
+
102
+ # Write new PDB content to the stream
103
+ stream = StringIO()
104
+ prody.writePDBStream(stream, structure)
105
+ stream.seek(0)
106
+
107
+ # Sanitize
108
+ rdprot = Chem.MolFromPDBBlock(stream.read(), sanitize=False, removeHs=False)
109
+ try:
110
+ Chem.SanitizeMol(rdprot)
111
+ plfprot = plf.Molecule(rdprot)
112
+ return plfprot
113
+
114
+ except Chem.AtomValenceException as e:
115
+ atom_num = int(e.args[0].replace('Explicit valence for atom # ', '').split()[0])
116
+ info = rdprot.GetAtomWithIdx(atom_num).GetPDBResidueInfo()
117
+ resnum = info.GetResidueNumber()
118
+ chain_id = info.GetChainId()
119
+ icode = f'"{info.GetInsertionCode()}"'
120
+
121
+ to_exclude_next = to_exclude + [(resnum, chain_id, icode)]
122
+ if verbose:
123
+ print(f'[{len(to_exclude_next)}] Removing broken residue with atom={atom_num}, resnum={resnum}, chain_id={chain_id}, icode={icode}')
124
+ return prepare_protein(input_structure, to_exclude=to_exclude_next, verbose=verbose)
125
+
126
+
127
+ def prepare_ligand(mol):
128
+ Chem.SanitizeMol(mol)
129
+ mol = Chem.AddHs(mol, addCoords=True)
130
+ ligand_plf = plf.Molecule.from_rdkit(mol)
131
+ return ligand_plf
132
+
133
+
134
+ def sdf_reader(sdf_path, progress_bar=False):
135
+ supp = Chem.SDMolSupplier(sdf_path, removeHs=True, sanitize=False)
136
+ for mol in tqdm(supp) if progress_bar else supp:
137
+ yield prepare_ligand(mol)
138
+
139
+
140
+ def profile_detailed(
141
+ ligand_plf, protein_plf, interaction_list=INTERACTION_LIST, ligand_name='ligand', protein_name='protein'
142
+ ):
143
+
144
+ fp = Fingerprint(interactions=interaction_list)
145
+ fp.run_from_iterable(lig_iterable=[ligand_plf], prot_mol=protein_plf, progress=False)
146
+
147
+ profile = []
148
+
149
+ for ligand_residue in ligand_plf.residues:
150
+ for protein_residue in protein_plf.residues:
151
+ metadata = fp.metadata(ligand_plf[ligand_residue], protein_plf[protein_residue])
152
+ for int_name, int_metadata in metadata.items():
153
+ for int_instance in int_metadata:
154
+ profile.append({
155
+ 'ligand': ligand_name,
156
+ 'protein': protein_name,
157
+ 'ligand_residue': str(ligand_residue),
158
+ 'protein_residue': str(protein_residue),
159
+ 'interaction': int_name,
160
+ 'alias': INTERACTION_ALIASES[int_name],
161
+ 'ligand_atoms': ','.join(map(str, int_instance['indices']['ligand'])),
162
+ 'protein_atoms': ','.join(map(str, int_instance['indices']['protein'])),
163
+ 'ligand_orig_atoms': ','.join(map(str, int_instance['parent_indices']['ligand'])),
164
+ 'protein_orig_atoms': ','.join(map(str, int_instance['parent_indices']['protein'])),
165
+ 'distance': int_instance['distance'],
166
+ 'plane_angle': int_instance.get('plane_angle', None),
167
+ 'normal_to_centroid_angle': int_instance.get('normal_to_centroid_angle', None),
168
+ 'intersect_distance': int_instance.get('intersect_distance', None),
169
+ 'intersect_radius': int_instance.get('intersect_radius', None),
170
+ 'pi_ring': int_instance.get('pi_ring', None),
171
+ })
172
+
173
+ return pd.DataFrame(profile)
174
+
175
+
176
+ def map_orig_atoms_to_new(atoms, mol):
177
+ orig2new = dict()
178
+ for atom in mol.GetAtoms():
179
+ orig2new[atom.GetUnsignedProp("mapindex")] = atom.GetIdx()
180
+
181
+ atoms = list(map(int, atoms.split(',')))
182
+ new_atoms = ','.join(map(str, [orig2new[atom] for atom in atoms]))
183
+ return new_atoms
184
+
185
+
186
+ def visualize(profile, ligand_plf, protein_plf):
187
+ metadata = dict()
188
+
189
+ for _, row in profile.iterrows():
190
+ if 'ligand_atoms' not in row:
191
+ row['ligand_atoms'] = map_orig_atoms_to_new(row['ligand_orig_atoms'], ligand_plf)
192
+ if 'protein_atoms' not in row:
193
+ row['protein_atoms'] = map_orig_atoms_to_new(row['protein_orig_atoms'], protein_plf[row['residue']])
194
+
195
+ namenum, chain = row['residue'].split('.')
196
+ name = namenum[:3]
197
+ num = int(namenum[3:])
198
+ protres = ResidueId(name=name, number=num, chain=chain)
199
+ key = (ResidueId(name='UNL', number=1, chain=None), protres)
200
+
201
+ metadata.setdefault(key, dict())
202
+ interaction = {
203
+ 'indices': {
204
+ 'ligand': tuple(map(int, row['ligand_atoms'].split(','))),
205
+ 'protein': tuple(map(int, row['protein_atoms'].split(','))),
206
+ },
207
+ 'parent_indices': {
208
+ 'ligand': tuple(map(int, row['ligand_atoms'].split(','))),
209
+ 'protein': tuple(map(int, row['protein_atoms'].split(','))),
210
+ },
211
+ 'distance': row['distance'],
212
+ }
213
+ # if row['plane_angle'] is not None:
214
+ # interaction['plane_angle'] = row['plane_angle']
215
+ # if row['normal_to_centroid_angle'] is not None:
216
+ # interaction['normal_to_centroid_angle'] = row['normal_to_centroid_angle']
217
+ # if row['intersect_distance'] is not None:
218
+ # interaction['intersect_distance'] = row['intersect_distance']
219
+ # if row['intersect_radius'] is not None:
220
+ # interaction['intersect_radius'] = row['intersect_radius']
221
+ # if row['pi_ring'] is not None:
222
+ # interaction['pi_ring'] = row['pi_ring']
223
+
224
+ metadata[key].setdefault(row['alias'], list()).append(interaction)
225
+
226
+ ifp = IFP(metadata)
227
+ fp = Fingerprint(interactions=INTERACTION_LIST, vicinity_cutoff=8.0)
228
+ fp.ifp = {0: ifp}
229
+ Complex3D.COLORS.update(INTERACTION_COLORS)
230
+ v = fp.plot_3d(ligand_mol=ligand_plf, protein_mol=protein_plf, frame=0)
231
+ return v
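
A minimal interaction-profiling sketch using the functions above together with the example files shipped in the repository; the `reduce` executable path is an assumption and is only needed when the PDB file lacks hydrogens:

    from rdkit import Chem
    from src.sbdd_metrics.interactions import read_protein, prepare_ligand, profile_detailed

    protein_plf = read_protein('examples/kras.pdb', reduce_exec='./reduce')
    mol = Chem.SDMolSupplier('examples/kras_ref_ligand.sdf', removeHs=True, sanitize=False)[0]
    ligand_plf = prepare_ligand(mol)

    profile = profile_detailed(ligand_plf, protein_plf)
    if not profile.empty:
        print(profile['alias'].value_counts())  # contact counts per interaction type
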
src/sbdd_metrics/metrics.py ADDED
@@ -0,0 +1,929 @@
1
+ import multiprocessing
2
+ import subprocess
3
+ import tempfile
4
+ from abc import abstractmethod
5
+ from collections import defaultdict
6
+ from pathlib import Path
7
+ from typing import Union, Dict, Collection, Set, Optional
8
+ import signal
9
+ import numpy as np
10
+ import pandas as pd
11
+ from unittest.mock import patch
12
+ from scipy.spatial.distance import jensenshannon
13
+ from fcd import get_fcd
14
+ from posebusters import PoseBusters
15
+ from posebusters.modules.distance_geometry import _get_bond_atom_indices, _get_angle_atom_indices
16
+ from rdkit import Chem, RDLogger
17
+ from rdkit.Chem import Descriptors, Crippen, Lipinski, QED, KekulizeException, AtomKekulizeException
18
+ from rdkit.Chem.rdForceFieldHelpers import UFFGetMoleculeForceField
20
+ from tqdm import tqdm
21
+ from useful_rdkit_utils import REOS, RingSystemLookup, get_min_ring_frequency, RingSystemFinder
22
+
23
+ from .interactions import INTERACTION_LIST, prepare_ligand, read_protein, profile_detailed
24
+ from .sascorer import calculateScore
25
+
26
+ def timeout_handler(signum, frame):
27
+ raise TimeoutError('Timeout')
28
+
29
+ BOND_SYMBOLS = {
30
+ Chem.rdchem.BondType.SINGLE: '-',
31
+ Chem.rdchem.BondType.DOUBLE: '=',
32
+ Chem.rdchem.BondType.TRIPLE: '#',
33
+ Chem.rdchem.BondType.AROMATIC: ':',
34
+ }
35
+
36
+
37
+ def is_nan(value):
38
+ return value is None or pd.isna(value)
39
+
40
+
41
+ def safe_run(func, timeout, **kwargs):
42
+ def _run(f, q, **kwargs):
43
+ r = f(**kwargs)
44
+ q.put(r)
45
+
46
+ queue = multiprocessing.Queue()
47
+ process = multiprocessing.Process(target=_run, kwargs={'f': func, 'q': queue, **kwargs})
48
+ process.start()
49
+ process.join(timeout)
50
+ if process.is_alive():
51
+ print(f"Function {func} didn't finish in {timeout} seconds. Terminating it.")
52
+ process.terminate()
53
+ process.join()
54
+ return None
55
+ elif not queue.empty():
56
+ return queue.get()
57
+ return None
58
+
59
+
60
+ class AbstractEvaluator:
61
+ ID = None
62
+ def __call__(self, molecule: Union[str, Path, Chem.Mol], protein: Union[str, Path] = None,
63
+ timeout=350):
64
+ """
65
+ Args:
66
+ molecule (Union[str, Path, Chem.Mol]): input molecule
67
+ protein (str): target protein
68
+
69
+ Returns:
70
+ metrics (dict): dictionary of metrics
71
+ """
72
+ RDLogger.DisableLog('rdApp.*')
73
+ self.check_format(molecule, protein)
74
+
75
+ # timeout handler
76
+ signal.signal(signal.SIGALRM, timeout_handler)
77
+ try:
78
+ signal.alarm(timeout)
79
+ results = self.evaluate(molecule, protein)
80
+ except TimeoutError:
81
+ print(f'Error when evaluating [{self.ID}]: Timeout after {timeout} seconds')
82
+ signal.alarm(0)
83
+ return {}
84
+ except Exception as e:
85
+ print(f'Error when evaluating [{self.ID}]: {e}')
86
+ signal.alarm(0)
87
+ return {}
88
+ finally:
89
+ signal.alarm(0)
90
+ return self.add_id(results)
91
+
92
+ def add_id(self, results):
93
+ if self.ID is not None:
94
+ return {f'{self.ID}.{key}': value for key, value in results.items()}
95
+ else:
96
+ return results
97
+
98
+ @abstractmethod
99
+ def evaluate(self, molecule: Union[str, Path, Chem.Mol], protein: Union[str, Path]) -> Dict[str, Union[int, float, str]]:
100
+ raise NotImplementedError
101
+
102
+ @staticmethod
103
+ def check_format(molecule, protein):
104
+ assert isinstance(molecule, (str, Path, Chem.Mol)), 'Supported molecule types: str, Path, Chem.Mol'
105
+ assert protein is None or isinstance(protein, (str, Path)), 'Supported protein types: str, Path'
106
+ if isinstance(molecule, (str, Path)):
107
+ supp = Chem.SDMolSupplier(str(molecule), sanitize=False)
108
+ assert len(supp) == 1, 'Only one molecule per file is supported'
109
+
110
+ @staticmethod
111
+ def load_molecule(molecule):
112
+ if isinstance(molecule, (str, Path)):
113
+ return Chem.SDMolSupplier(str(molecule), sanitize=False)[0]
114
+ return Chem.Mol(molecule) # create copy to avoid overriding properties of the input molecule
115
+
116
+ @staticmethod
117
+ def save_molecule(molecule, sdf_path):
118
+ if isinstance(molecule, (str, Path)):
119
+ return molecule
120
+
121
+ with Chem.SDWriter(str(sdf_path)) as w:
122
+ try:
123
+ w.write(molecule)
124
+ except (RuntimeError, ValueError) as e:
125
+ if isinstance(e, (KekulizeException, AtomKekulizeException)):
126
+ w.SetKekulize(False)
127
+ w.write(molecule)
128
+ w.SetKekulize(True)
129
+ else:
130
+ w.write(Chem.Mol())
131
+ print('[AbstractEvaluator] Error when saving the molecule')
132
+
133
+ return sdf_path
134
+
135
+ @property
136
+ def dtypes(self):
137
+ return self.add_id(self._dtypes)
138
+
139
+ @property
140
+ @abstractmethod
141
+ def _dtypes(self):
142
+ raise NotImplementedError
143
+
144
+
145
+ class RepresentationEvaluator(AbstractEvaluator):
146
+ ID = 'representation'
147
+
148
+ def evaluate(self, molecule, protein=None):
149
+ molecule = self.load_molecule(molecule)
150
+ try:
151
+ smiles = Chem.MolToSmiles(molecule)
152
+ except:
153
+ smiles = None
154
+
155
+ return {'smiles': smiles}
156
+
157
+ @property
158
+ def _dtypes(self):
159
+ return {'smiles': str}
160
+
161
+
162
+ class MolPropertyEvaluator(AbstractEvaluator):
163
+ ID = 'mol_props'
164
+
165
+ def evaluate(self, molecule, protein=None):
166
+ molecule = self.load_molecule(molecule)
167
+ return {k: v for k, v in molecule.GetPropsAsDict().items() if isinstance(v, float)}
168
+
169
+ @property
170
+ def _dtypes(self):
171
+ return {'*': float}
172
+
173
+
174
+ class PoseBustersEvaluator(AbstractEvaluator):
175
+ ID = 'posebusters'
176
+ def __init__(self, pb_conf: str = 'dock'):
177
+ self.posebusters = PoseBusters(config=pb_conf)
178
+
179
+ @patch('rdkit.RDLogger.EnableLog', lambda x: None)
180
+ @patch('rdkit.RDLogger.DisableLog', lambda x: None)
181
+ def evaluate(self, molecule, protein=None):
182
+ result = safe_run(self.posebusters.bust, timeout=20, mol_pred=molecule, mol_cond=protein)
183
+ if result is None:
184
+ return dict()
185
+
186
+ with pd.option_context("future.no_silent_downcasting", True):
187
+ result = dict(result.fillna(False).iloc[0])
188
+ result['all'] = all([bool(value) if not is_nan(value) else False for value in result.values()])
189
+ return result
190
+
191
+ @property
192
+ def _dtypes(self):
193
+ return {'*': bool}
194
+
195
+
196
+ class GeometryEvaluator(AbstractEvaluator):
197
+ ID = 'geometry'
198
+
199
+ def evaluate(self, molecule, protein=None):
200
+ mol = self.load_molecule(molecule)
201
+ data = self.get_distances_and_angles(mol)
202
+ return data
203
+
204
+ @staticmethod
205
+ def angle_repr(mol, triplet):
206
+ i = mol.GetAtomWithIdx(triplet[0]).GetSymbol()
207
+ j = mol.GetAtomWithIdx(triplet[1]).GetSymbol()
208
+ k = mol.GetAtomWithIdx(triplet[2]).GetSymbol()
209
+ ij = BOND_SYMBOLS[mol.GetBondBetweenAtoms(triplet[0], triplet[1]).GetBondType()]
210
+ jk = BOND_SYMBOLS[mol.GetBondBetweenAtoms(triplet[1], triplet[2]).GetBondType()]
211
+
212
+ # Unified (sorted) representation
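+ # e.g. both O-C=C and C=C-O map to 'C=C-O': endpoints are sorted first, then bond symbols break ties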
213
+ if i < k:
214
+ return f'{i}{ij}{j}{jk}{k}'
215
+ elif i > k:
216
+ return f'{k}{jk}{j}{ij}{i}'
217
+ elif ij <= jk:
218
+ return f'{i}{ij}{j}{jk}{k}'
219
+ else:
220
+ return f'{k}{jk}{j}{ij}{i}'
221
+
222
+ @staticmethod
223
+ def bond_repr(mol, pair):
224
+ i = mol.GetAtomWithIdx(pair[0]).GetSymbol()
225
+ j = mol.GetAtomWithIdx(pair[1]).GetSymbol()
226
+ ij = BOND_SYMBOLS[mol.GetBondBetweenAtoms(pair[0], pair[1]).GetBondType()]
227
+ # Unified (sorted) representation
228
+ return f'{i}{ij}{j}' if i <= j else f'{j}{ij}{i}'
229
+
230
+ @staticmethod
231
+ def get_bond_distances(mol, bonds):
232
+ i, j = np.array(bonds).T
233
+ x = mol.GetConformer().GetPositions()
234
+ xi = x[i]
235
+ xj = x[j]
236
+ bond_distances = np.linalg.norm(xi - xj, axis=1)
237
+ return bond_distances
238
+
239
+ @staticmethod
240
+ def get_angle_values(mol, triplets):
241
+ i, j, k = np.array(triplets).T
242
+ x = mol.GetConformer().GetPositions()
243
+ xi = x[i]
244
+ xj = x[j]
245
+ xk = x[k]
246
+ vji = xi - xj
247
+ vjk = xk - xj
248
+ angles = np.arccos((vji * vjk).sum(axis=1) / (np.linalg.norm(vji, axis=1) * np.linalg.norm(vjk, axis=1)))
249
+ return np.degrees(angles)
250
+
251
+ @staticmethod
252
+ def get_distances_and_angles(mol):
253
+ data = defaultdict(list)
254
+ bonds = _get_bond_atom_indices(mol)
255
+ distances = GeometryEvaluator.get_bond_distances(mol, bonds)
256
+ for b, d in zip(bonds, distances):
257
+ data[GeometryEvaluator.bond_repr(mol, b)].append(d)
258
+
259
+ triplets = _get_angle_atom_indices(bonds)
260
+ angles = GeometryEvaluator.get_angle_values(mol, triplets)
261
+ for t, a in zip(triplets, angles):
262
+ data[GeometryEvaluator.angle_repr(mol, t)].append(a)
263
+
264
+ return data
265
+
266
+ @property
267
+ def _dtypes(self):
268
+ return {'*': list}
269
+
270
+
271
+ class EnergyEvaluator(AbstractEvaluator):
272
+ ID = 'energy'
273
+
274
+ def evaluate(self, molecule, protein=None):
275
+ molecule = self.load_molecule(molecule)
276
+ try:
277
+ energy = self.get_energy(molecule)
278
+ except:
279
+ energy = None
280
+ return {'energy': energy}
281
+
282
+ @staticmethod
283
+ def get_energy(mol, conf_id=-1):
284
+ mol = Chem.AddHs(mol, addCoords=True)
285
+ uff = UFFGetMoleculeForceField(mol, confId=conf_id)
286
+ e_uff = uff.CalcEnergy()
287
+ return e_uff
288
+
289
+ @property
290
+ def _dtypes(self):
291
+ return {'energy': float}
292
+
293
+
294
+ class InteractionsEvaluator(AbstractEvaluator):
295
+ ID = 'interactions'
296
+
297
+ def __init__(self, reduce='./reduce'):
298
+ self.reduce = reduce
299
+
300
+ @property
301
+ def default_profile(self):
302
+ return {i: 0 for i in INTERACTION_LIST}
303
+
304
+ def evaluate(self, molecule, protein=None):
305
+ molecule = self.load_molecule(molecule)
306
+ profile = self.default_profile
307
+ try:
308
+ ligand_plf = prepare_ligand(molecule)
309
+ protein_plf = read_protein(str(protein), reduce_exec=self.reduce)
310
+ interactions = profile_detailed(ligand_plf, protein_plf)
311
+ if not interactions.empty:
312
+ profile.update(dict(interactions.interaction.value_counts()))
313
+ except Exception:
314
+ pass
315
+ return profile
316
+
317
+ @property
318
+ def _dtypes(self):
319
+ return {'*': int}
320
+
321
+
322
+ class GninaEvalulator(AbstractEvaluator):
323
+ ID = 'gnina'
324
+ def __init__(self, gnina):
325
+ self.gnina = gnina
326
+
327
+ def evaluate(self, molecule, protein=None):
328
+ with tempfile.TemporaryDirectory() as tmpdir:
329
+ molecule = self.save_molecule(molecule, sdf_path=Path(tmpdir, 'molecule.sdf'))
330
+ gnina_cmd = f'{self.gnina} -r {str(protein)} -l {str(molecule)} --minimize --seed 42 --no_gpu'
331
+ gnina_result = subprocess.run(gnina_cmd, shell=True, capture_output=True, text=True)
332
+ n_atoms = self.load_molecule(molecule).GetNumAtoms()
333
+
334
+ gnina_scores = self.read_gnina_results(gnina_result)
335
+
336
+ # Additionally computing ligand efficiency
337
+ gnina_scores['vina_efficiency'] = gnina_scores['vina_score'] / n_atoms if n_atoms > 0 else None
338
+ gnina_scores['gnina_efficiency'] = gnina_scores['gnina_score'] / n_atoms if n_atoms > 0 else None
339
+ return gnina_scores
340
+
341
+ @staticmethod
342
+ def read_gnina_results(gnina_result):
343
+ res = {
344
+ 'vina_score': None,
345
+ 'gnina_score': None,
346
+ 'minimisation_rmsd': None,
347
+ 'cnn_score': None,
348
+ }
349
+ if gnina_result.returncode != 0:
350
+ print(gnina_result.stderr)
351
+ return res
352
+
353
+ for line in gnina_result.stdout.split('\n'):
354
+ if line.startswith('Affinity'):
355
+ res['vina_score'] = float(line.split(' ')[1].strip())
356
+ if line.startswith('CNNaffinity'):
357
+ res['gnina_score'] = float(line.split(' ')[1].strip())
358
+ if line.startswith('CNNscore'):
359
+ res['cnn_score'] = float(line.split(' ')[1].strip())
360
+ if line.startswith('RMSD'):
361
+ res['minimisation_rmsd'] = float(line.split(' ')[1].strip())
362
+
363
+ return res
364
+
365
+ @property
366
+ def _dtypes(self):
367
+ return {'*': float}
368
+
369
+
370
+ class MedChemEvaluator(AbstractEvaluator):
371
+ ID = 'medchem'
372
+ def __init__(self, connectivity_threshold=1.0):
373
+ self.connectivity_threshold = connectivity_threshold
374
+
375
+ def evaluate(self, molecule, protein=None):
376
+ molecule = self.load_molecule(molecule)
377
+ valid = self.is_valid(molecule)
378
+
379
+ if valid:
380
+ Chem.SanitizeMol(molecule)
381
+
382
+ connected = None if not valid else self.is_connected(molecule)
383
+ qed = None if not valid else self.calculate_qed(molecule)
384
+ sa = None if not valid else self.calculate_sa(molecule)
385
+ logp = None if not valid else self.calculate_logp(molecule)
386
+ lipinski = None if not valid else self.calculate_lipinski(molecule)
387
+ n_rotatable_bonds = None if not valid else self.calculate_rotatable_bonds(molecule)
388
+ size = self.calculate_molecule_size(molecule)
389
+
390
+ return {
391
+ 'valid': valid,
392
+ 'connected': connected,
393
+ 'qed': qed,
394
+ 'sa': sa,
395
+ 'logp': logp,
396
+ 'lipinski': lipinski,
397
+ 'size': size,
398
+ 'n_rotatable_bonds': n_rotatable_bonds,
399
+ }
400
+
401
+ @staticmethod
402
+ def is_valid(rdmol):
403
+ if rdmol.GetNumAtoms() < 1:
404
+ return False
405
+
406
+ _mol = Chem.Mol(rdmol)
407
+ try:
408
+ Chem.SanitizeMol(_mol)
409
+ except ValueError:
410
+ return False
411
+
412
+ return True
413
+
414
+ def is_connected(self, rdmol):
415
+ if rdmol.GetNumAtoms() < 1:
416
+ return False
417
+
418
+ try:
419
+ mol_frags = Chem.rdmolops.GetMolFrags(rdmol, asMols=True)
420
+ largest_frag = max(mol_frags, default=rdmol, key=lambda m: m.GetNumAtoms())
421
+ return largest_frag.GetNumAtoms() / rdmol.GetNumAtoms() >= self.connectivity_threshold
422
+ except:
423
+ return False
424
+
425
+ @staticmethod
426
+ def calculate_qed(rdmol):
427
+ try:
428
+ return QED.qed(rdmol)
429
+ except:
430
+ return None
431
+
432
+ @staticmethod
433
+ def calculate_sa(rdmol):
434
+ try:
435
+ sa = calculateScore(rdmol)
436
+ return sa
437
+ except:
438
+ return None
439
+
440
+ @staticmethod
441
+ def calculate_logp(rdmol):
442
+ try:
443
+ return Crippen.MolLogP(rdmol)
444
+ except:
445
+ return None
446
+
447
+ @staticmethod
448
+ def calculate_lipinski(rdmol):
449
+ try:
450
+ rule_1 = Descriptors.ExactMolWt(rdmol) < 500
451
+ rule_2 = Lipinski.NumHDonors(rdmol) <= 5
452
+ rule_3 = Lipinski.NumHAcceptors(rdmol) <= 10
453
+ rule_4 = -2 <= (logp := Crippen.MolLogP(rdmol)) <= 5
454
+ rule_5 = Chem.rdMolDescriptors.CalcNumRotatableBonds(rdmol) <= 10
455
+ return np.sum([int(a) for a in [rule_1, rule_2, rule_3, rule_4, rule_5]])
456
+ except:
457
+ return None
458
+
459
+ @staticmethod
460
+ def calculate_molecule_size(rdmol):
461
+ try:
462
+ return rdmol.GetNumAtoms()
463
+ except:
464
+ return None
465
+
466
+ @staticmethod
467
+ def calculate_rotatable_bonds(rdmol):
468
+ try:
469
+ return Chem.rdMolDescriptors.CalcNumRotatableBonds(rdmol)
470
+ except:
471
+ return None
472
+
473
+ @property
474
+ def _dtypes(self):
475
+ return {
476
+ 'valid': bool,
477
+ 'connected': bool,
478
+ 'qed': float,
479
+ 'sa': float,
480
+ 'logp': float,
481
+ 'lipinski': int,
482
+ 'size': int,
483
+ 'n_rotatable_bonds': int,
484
+ }
485
+
486
+
487
+ class ClashEvaluator(AbstractEvaluator):
488
+ ID = 'clashes'
489
+ def __init__(self, margin=0.75, ignore={'H'}):
490
+ self.margin = margin
491
+ self.ignore = ignore
492
+
493
+ def evaluate(self, molecule=None, protein=None):
494
+ result = {
495
+ 'passed_clash_score_ligands': None,
496
+ 'passed_clash_score_pockets': None,
497
+ 'passed_clash_score_between': None,
498
+ }
499
+ if molecule is not None:
500
+ molecule = self.load_molecule(molecule)
501
+ clash_score = self.clash_score(molecule)
502
+ result['clash_score_ligands'] = clash_score
503
+ result['passed_clash_score_ligands'] = (clash_score == 0)
504
+
505
+ if protein is not None:
506
+ protein = Chem.MolFromPDBFile(str(protein), sanitize=False)
507
+ clash_score = self.clash_score(protein)
508
+ result['clash_score_pockets'] = clash_score
509
+ result['passed_clash_score_pockets'] = (clash_score == 0)
510
+
511
+ if molecule is not None and protein is not None:
512
+ clash_score = self.clash_score(molecule, protein)
513
+ result['clash_score_between'] = clash_score
514
+ result['passed_clash_score_between'] = (clash_score == 0)
515
+
516
+ return result
517
+
518
+ def clash_score(self, rdmol1, rdmol2=None):
519
+ """
520
+ Computes a clash score as the number of atoms that have at least one
521
+ clash divided by the number of atoms in the molecule.
522
+
523
+ INTERMOLECULAR CLASH SCORE
524
+ If rdmol2 is provided, the score is the fraction of atoms in rdmol1
525
+ that have at least one clash with rdmol2.
526
+ We define a clash if two atoms are closer than "margin times the sum of
527
+ their van der Waals radii".
528
+
529
+ INTRAMOLECULAR CLASH SCORE
530
+ If rdmol2 is not provided, the score is the fraction of atoms in rdmol1
531
+ that have at least one clash with other atoms in rdmol1.
532
+ In this case, a clash is defined by margin times the atoms' smallest
533
+ covalent radii (among single, double and triple bond radii). This is done
534
+ so that this function is applicable even if no connectivity information is
535
+ available.
536
+ """
537
+
538
+ intramolecular = rdmol2 is None
539
+ if intramolecular:
540
+ rdmol2 = rdmol1
541
+
542
+ coord1, radii1 = self.coord_and_radii(rdmol1, intramolecular=intramolecular)
543
+ coord2, radii2 = self.coord_and_radii(rdmol2, intramolecular=intramolecular)
544
+
545
+ dist = np.sqrt(np.sum((coord1[:, None, :] - coord2[None, :, :]) ** 2, axis=-1))
546
+ if intramolecular:
547
+ np.fill_diagonal(dist, np.inf)
548
+
549
+ clashes = dist < self.margin * (radii1[:, None] + radii2[None, :])
550
+ clashes = np.any(clashes, axis=1)
551
+ return np.mean(clashes)
552
+
553
+ def coord_and_radii(self, rdmol, intramolecular):
554
+ _periodic_table = Chem.GetPeriodicTable()
555
+ _get_radius = _periodic_table.GetRcovalent if intramolecular else _periodic_table.GetRvdw
556
+
557
+ coord = rdmol.GetConformer().GetPositions()
558
+ radii = np.array([_get_radius(a.GetSymbol()) for a in rdmol.GetAtoms()])
559
+
560
+ mask = np.array([a.GetSymbol() not in self.ignore for a in rdmol.GetAtoms()])
561
+ coord = coord[mask]
562
+ radii = radii[mask]
563
+
564
+ assert coord.shape[0] == radii.shape[0]
565
+ return coord, radii
566
+
567
+ @property
568
+ def _dtypes(self):
569
+ return {
570
+ 'clash_score_ligands': float,
571
+ 'clash_score_pockets': float,
572
+ 'clash_score_between': float,
573
+ 'passed_clash_score_ligands': bool,
574
+ 'passed_clash_score_pockets': bool,
575
+ 'passed_clash_score_between': bool,
576
+ }
577
+
578
+
579
+ class RingCountEvaluator(AbstractEvaluator):
580
+ ID = 'ring_count'
581
+
582
+ def evaluate(self, molecule, protein=None):
583
+ _mol = self.load_molecule(molecule)
584
+
585
+ # compute ring info if not yet available
586
+ try:
587
+ _mol.UpdatePropertyCache()
588
+ except ValueError:
589
+ return {}
590
+ Chem.GetSymmSSSR(_mol)
591
+
592
+ rings = _mol.GetRingInfo().AtomRings()
593
+ ring_sizes = [len(r) for r in rings]
594
+
595
+ ring_counts = defaultdict(int)
596
+ for k in ring_sizes:
597
+ ring_counts[f"num_{k}_rings"] += 1
598
+
599
+ return ring_counts
600
+
601
+ @property
602
+ def _dtypes(self):
603
+ return {'*': int}
604
+
605
+
606
+ class ChemblRingEvaluator(AbstractEvaluator):
607
+ ID = 'chembl_ring_systems'
608
+
609
+ def __init__(self):
610
+ self.ring_system_lookup = RingSystemLookup.default() # ChEMBL
611
+
612
+ def evaluate(self, molecule, protein=None):
613
+
614
+ results = {
615
+ 'min_ring_smi': None,
616
+ 'min_ring_freq_gt0_': None,
617
+ 'min_ring_freq_gt10_': None,
618
+ 'min_ring_freq_gt100_': None,
619
+ }
620
+
621
+ molecule = self.load_molecule(molecule)
622
+
623
+ try:
624
+ Chem.SanitizeMol(molecule)
625
+ freq_list = self.ring_system_lookup.process_mol(molecule)
627
+ except ValueError:
628
+ return results
629
+
630
+ min_ring, min_freq = get_min_ring_frequency(freq_list)
631
+
632
+ return {
633
+ 'min_ring_smi': min_ring,
634
+ 'min_ring_freq_gt0_': min_freq > 0,
635
+ 'min_ring_freq_gt10_': min_freq > 10,
636
+ 'min_ring_freq_gt100_': min_freq > 100,
637
+ }
638
+
639
+ @property
640
+ def _dtypes(self):
641
+ return {
642
+ 'min_ring_smi': str,
643
+ 'min_ring_freq_gt0_': bool,
644
+ 'min_ring_freq_gt10_': bool,
645
+ 'min_ring_freq_gt100_': bool,
646
+ }
647
+
648
+
649
+ class REOSEvaluator(AbstractEvaluator):
650
+ # Based on https://practicalcheminformatics.blogspot.com/2024/05/generative-molecular-design-isnt-as.html
651
+ ID = 'reos'
652
+
653
+ def __init__(self):
654
+ self.reos = REOS()
655
+
656
+ def evaluate(self, molecule, protein=None):
657
+
658
+ molecule = self.load_molecule(molecule)
659
+ try:
660
+ Chem.SanitizeMol(molecule)
661
+ except ValueError:
662
+ return {rule_set: False for rule_set in self.reos.get_available_rule_sets()}
663
+
664
+ results = {}
665
+ for rule_set in self.reos.get_available_rule_sets():
666
+ self.reos.set_active_rule_sets([rule_set])
667
+ if rule_set == 'PW':
668
+ self.reos.drop_rule('furans')
669
+
670
+ reos_res = self.reos.process_mol(molecule)
671
+ results[rule_set] = reos_res[0] == 'ok'
672
+
673
+ results['all'] = all([bool(value) if not is_nan(value) else False for value in results.values()])
674
+ return results
675
+
676
+ @property
677
+ def _dtypes(self):
678
+ return {'*': bool}
679
+
680
+
681
+ class FullEvaluator(AbstractEvaluator):
682
+ def __init__(
683
+ self,
684
+ pb_conf: str = 'dock',
685
+ gnina: Optional[Union[Path, str]] = None,
686
+ reduce: Optional[Union[Path, str]] = None,
687
+ connectivity_threshold: float = 1.0,
688
+ margin: float = 0.75,
689
+ ignore: Set[str] = {'H'},
690
+ exclude_evaluators: Collection[str] = [],
691
+ ):
692
+ all_evaluators = [
693
+ RepresentationEvaluator(),
694
+ MolPropertyEvaluator(),
695
+ PoseBustersEvaluator(pb_conf=pb_conf),
696
+ MedChemEvaluator(connectivity_threshold=connectivity_threshold),
697
+ ClashEvaluator(margin=margin, ignore=ignore),
698
+ GeometryEvaluator(),
699
+ RingCountEvaluator(),
700
+ EnergyEvaluator(),
701
+ ChemblRingEvaluator(),
702
+ REOSEvaluator()
703
+ ]
704
+ if gnina is not None:
705
+ all_evaluators.append(GninaEvalulator(gnina=gnina))
706
+ else:
707
+ print(f'Evaluator [{GninaEvalulator.ID}] is not included')
708
+ if reduce is not None:
709
+ all_evaluators.append(InteractionsEvaluator(reduce=reduce))
710
+ else:
711
+ print(f'Evaluator [{InteractionsEvaluator.ID}] is not included')
712
+
713
+ self.evaluators = []
714
+ for e in all_evaluators:
715
+ if e.ID in exclude_evaluators:
716
+ print(f'Excluded Evaluator [{e.ID}]')
717
+ else:
718
+ self.evaluators.append(e)
719
+
720
+ print('Will use the following evaluators:')
721
+ for e in self.evaluators:
722
+ print(f'- [{e.ID}]')
723
+
724
+
725
+ def evaluate(self, molecule, protein):
726
+ results = {}
727
+ for evaluator in self.evaluators:
728
+ results.update(evaluator(molecule, protein))
729
+ return results
730
+
731
+ @property
732
+ def _dtypes(self):
733
+ all_dtypes = {}
734
+ for evaluator in self.evaluators:
735
+ all_dtypes.update(evaluator.dtypes)
736
+ return all_dtypes
737
+
738
+
739
+ ########################################################################################
740
+ ################################# Collection Metrics ###################################
741
+ ########################################################################################
742
+
743
+
744
+ class AbstractCollectionEvaluator:
745
+ ID = None
746
+ def __call__(self, smiles: Collection[str], timeout=300):
747
+ """
748
+ Args:
749
+ smiles (Collection[smiles]): input list of SMILES
750
+
751
+ Returns:
752
+ metrics (dict): dictionary of metrics
753
+ """
754
+ if self.ID is not None:
755
+ print(f'Running CollectionEvaluator [{self.ID}]')
756
+
757
+ RDLogger.DisableLog('rdApp.*')
758
+ self.check_format(smiles)
759
+ # timeout handler
760
+ signal.signal(signal.SIGALRM, timeout_handler)
761
+ try:
762
+ signal.alarm(timeout)
763
+ results = self.evaluate(smiles)
764
+ except TimeoutError:
765
+ print(f'Error when evaluating [{self.ID}]: Timeout after {timeout} seconds')
766
+ signal.alarm(0)
767
+ return {}
768
+ except Exception as e:
769
+ print(f'Error when evaluating [{self.ID}]: {e}')
770
+ signal.alarm(0)
771
+ return {}
772
+ finally:
773
+ print(f'Finished CollectionEvaluator [{self.ID}]')
774
+ signal.alarm(0)
775
+ return results
776
+
777
+ @staticmethod
778
+ def check_format(smiles):
779
+ assert len(smiles) > 0, 'List of input SMILES cannot be empty'
780
+ assert isinstance(smiles, Collection), 'Only list of SMILES supported'
781
+ assert isinstance(smiles[0], str), 'Only list of SMILES supported'
782
+
783
+
784
+ class UniquenessEvaluator(AbstractCollectionEvaluator):
785
+ ID = 'uniqueness'
786
+ def evaluate(self, smiles: Collection[str]):
787
+ uniqueness = len(set(smiles)) / len(smiles)
788
+ return {'uniqueness': uniqueness}
789
+
790
+
791
+ class NoveltyEvaluator(AbstractCollectionEvaluator):
792
+ ID = 'novelty'
793
+ def __init__(self, reference_smiles: Collection[str]):
794
+ self.reference_smiles = set(list(reference_smiles))
795
+ assert len(self.reference_smiles) > 0, 'List of reference SMILES cannot be empty'
796
+
797
+ def evaluate(self, smiles: Collection[str]):
798
+ smiles = set(smiles)
799
+ novel = [smi for smi in smiles if smi not in self.reference_smiles]
800
+ novelty = len(novel) / len(smiles)
801
+ return {'novelty': novelty}
802
+
803
+ def canonical_smiles(smiles):
804
+ for smi in smiles:
805
+ try:
806
+ mol = Chem.MolFromSmiles(smi)
807
+ if mol is not None:
808
+ yield Chem.MolToSmiles(mol)
809
+ except:
810
+ yield None
811
+
812
+ class FCDEvaluator(AbstractCollectionEvaluator):
813
+ ID = 'fcd'
814
+ def __init__(self, reference_smiles: Collection[str]):
815
+ self.reference_smiles = list(reference_smiles)
816
+ assert len(self.reference_smiles) > 0, 'List of reference SMILES cannot be empty'
817
+
818
+ def evaluate(self, smiles: Collection[str]):
819
+ if len(smiles) > len(self.reference_smiles):
820
+ print('Number of reference molecules must be at least the number of input molecules')
821
+ return {'fcd': None}
822
+
823
+ np.random.seed(42)
824
+ reference_smiles = np.random.choice(self.reference_smiles, len(smiles), replace=False).tolist()
825
+ reference_smiles_canonical = [w for w in canonical_smiles(reference_smiles) if w is not None]
826
+ smiles_canonical = [w for w in canonical_smiles(smiles) if w is not None]
827
+ fcd = get_fcd(reference_smiles_canonical, smiles_canonical)
828
+ return {'fcd': fcd}
829
+
830
+
831
+ class RingDistributionEvaluator(AbstractCollectionEvaluator):
832
+ ID = 'ring_system_distribution'
833
+
834
+ def __init__(self, reference_smiles: Collection[str], jsd_on_k_most_freq: Collection[int] = ()):
835
+ self.ring_system_finder = RingSystemFinder()
836
+ self.ref_ring_dict = self.compute_ring_dict(reference_smiles)
837
+ self.jsd_on_k_most_freq = jsd_on_k_most_freq
838
+
839
+ def compute_ring_dict(self, molecules):
840
+
841
+ ring_system_dict = defaultdict(int)
842
+
843
+ for mol in tqdm(molecules, desc="Computing ring systems"):
844
+
845
+ if isinstance(mol, str):
846
+ mol = Chem.MolFromSmiles(mol)
847
+
848
+ try:
849
+ ring_system_list = self.ring_system_finder.find_ring_systems(mol, as_mols=True)
850
+ except ValueError:
851
+ print(f"WARNING[{type(self).__name__}]: error while computing ring systems; skipping molecule.")
852
+ continue
853
+
854
+ for ring in ring_system_list:
855
+ inchi_key = Chem.MolToInchiKey(ring)
856
+ ring_system_dict[inchi_key] += 1
857
+
858
+ return ring_system_dict
859
+
860
+ def precision(self, query_ring_dict):
861
+ query_ring_systems = set(query_ring_dict.keys())
862
+ ref_ring_systems = set(self.ref_ring_dict.keys())
863
+ intersection = ref_ring_systems & query_ring_systems
864
+ return len(intersection) / len(query_ring_systems) if len(query_ring_systems) > 0 else 0
865
+
866
+ def recall(self, query_ring_dict):
867
+ query_ring_systems = set(query_ring_dict.keys())
868
+ ref_ring_systems = set(self.ref_ring_dict.keys())
869
+ intersection = ref_ring_systems & query_ring_systems
870
+ return len(intersection) / len(ref_ring_systems) if len(ref_ring_systems) > 0 else 0
871
+
872
+ def jsd(self, query_ring_dict, k_most_freq=None):
873
+
874
+ if k_most_freq is None:
875
+ # evaluate on the union of all ring systems
876
+ sample_space = set(self.ref_ring_dict.keys()) | set(query_ring_dict.keys())
877
+ else:
878
+ # evaluate only on the k most common rings from the reference set
879
+ sorted_rings = [k for k, v in sorted(self.ref_ring_dict.items(), key=lambda item: item[1], reverse=True)]
880
+ sample_space = sorted_rings[:k_most_freq]
881
+
882
+ p = np.zeros(len(sample_space))
883
+ q = np.zeros(len(sample_space))
884
+
885
+ for i, inchi_key in enumerate(sample_space):
886
+ p[i] = self.ref_ring_dict.get(inchi_key, 0)
887
+ q[i] = query_ring_dict.get(inchi_key, 0)
888
+
889
+ # normalize
890
+ p = p / np.sum(p)
891
+ q = q / np.sum(q)
892
+
893
+ return jensenshannon(p, q)
894
+
895
+ def evaluate(self, smiles: Collection[str]):
896
+
897
+ query_ring_dict = self.compute_ring_dict(smiles)
898
+
899
+ out = {
900
+ "precision": self.precision(query_ring_dict),
901
+ "recall": self.recall(query_ring_dict),
902
+ "jsd": self.jsd(query_ring_dict),
903
+ }
904
+
905
+ out.update(
906
+ {f"jsd_{k}_most_freq": self.jsd(query_ring_dict, k_most_freq=k) for k in self.jsd_on_k_most_freq}
907
+ )
908
+
909
+ return out
910
+
911
+
912
+ class FullCollectionEvaluator(AbstractCollectionEvaluator):
913
+ def __init__(self, reference_smiles: Collection[str], exclude_evaluators: Collection[str] = []):
914
+ self.evaluators = [
915
+ UniquenessEvaluator(),
916
+ NoveltyEvaluator(reference_smiles=reference_smiles),
917
+ FCDEvaluator(reference_smiles=reference_smiles),
918
+ RingDistributionEvaluator(reference_smiles, jsd_on_k_most_freq=[10, 100, 1000, 10000]),
919
+ ]
920
+ # Build a filtered list instead of removing elements while iterating,
+ # which would silently skip the evaluator after each removal
+ kept_evaluators = []
+ for e in self.evaluators:
+     if e.ID in exclude_evaluators:
+         print(f'Excluding CollectionEvaluator [{e.ID}]')
+     else:
+         kept_evaluators.append(e)
+ self.evaluators = kept_evaluators
924
+
925
+ def evaluate(self, smiles):
926
+ results = {}
927
+ for evaluator in self.evaluators:
928
+ results.update(evaluator(smiles))
929
+ return results
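
A minimal sketch tying the two evaluator families together; the file paths and binary locations are placeholders, and in practice `reference_smiles` would be the training-set SMILES (FCD in particular needs a sizeable reference set):

    import numpy as np
    from src.sbdd_metrics.metrics import FullEvaluator, FullCollectionEvaluator

    # Per-molecule metrics for one ligand/pocket pair
    evaluator = FullEvaluator(gnina='./gnina', reduce='./reduce')
    result = evaluator('samples/target_0/0_ligand.sdf', 'samples/target_0/0_pocket.pdb')
    print(result.get('medchem.qed'), result.get('gnina.vina_score'))

    # Collection-level metrics over many generated molecules
    reference_smiles = list(np.load('train_smiles.npy'))  # hypothetical reference set
    collection_evaluator = FullCollectionEvaluator(reference_smiles=reference_smiles)
    generated_smiles = ['CCO', 'c1ccccc1O']               # placeholder SMILES
    print(collection_evaluator(generated_smiles))
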
src/sbdd_metrics/sascorer.py ADDED
@@ -0,0 +1,173 @@
1
+ #
2
+ # calculation of synthetic accessibility score as described in:
3
+ #
4
+ # Estimation of Synthetic Accessibility Score of Drug-like Molecules based on Molecular Complexity and Fragment Contributions
5
+ # Peter Ertl and Ansgar Schuffenhauer
6
+ # Journal of Cheminformatics 1:8 (2009)
7
+ # http://www.jcheminf.com/content/1/1/8
8
+ #
9
+ # several small modifications to the original paper are included
10
+ # particularly a slightly different formula for the macrocyclic penalty
11
+ # and taking into account also molecule symmetry (fingerprint density)
12
+ #
13
+ # for a set of 10k diverse molecules the agreement between the original method
14
+ # as implemented in PipelinePilot and this implementation is r2 = 0.97
15
+ #
16
+ # peter ertl & greg landrum, september 2013
17
+ #
18
+
19
+
20
+ from rdkit import Chem
21
+ from rdkit.Chem import rdMolDescriptors
22
+ import pickle
23
+
24
+ import math
25
+ from collections import defaultdict
26
+
27
+ import os.path as op
28
+
29
+ _fscores = None
30
+
31
+
32
+ def readFragmentScores(name='fpscores'):
33
+ import gzip
34
+ global _fscores
35
+ # generate the full path filename:
36
+ if name == "fpscores":
37
+ name = op.join(op.dirname(__file__), name)
38
+ data = pickle.load(gzip.open('%s.pkl.gz' % name))
39
+ outDict = {}
40
+ for i in data:
41
+ for j in range(1, len(i)):
42
+ outDict[i[j]] = float(i[0])
43
+ _fscores = outDict
44
+
45
+
46
+ def numBridgeheadsAndSpiro(mol, ri=None):
47
+ nSpiro = rdMolDescriptors.CalcNumSpiroAtoms(mol)
48
+ nBridgehead = rdMolDescriptors.CalcNumBridgeheadAtoms(mol)
49
+ return nBridgehead, nSpiro
50
+
51
+
52
+ def calculateScore(m):
53
+ if _fscores is None:
54
+ readFragmentScores()
55
+
56
+ # fragment score
57
+ fp = rdMolDescriptors.GetMorganFingerprint(m,
58
+ 2) # <- 2 is the *radius* of the circular fingerprint
59
+ fps = fp.GetNonzeroElements()
60
+ score1 = 0.
61
+ nf = 0
62
+ for bitId, v in fps.items():
63
+ nf += v
64
+ sfp = bitId
65
+ score1 += _fscores.get(sfp, -4) * v
66
+ score1 /= nf
67
+
68
+ # features score
69
+ nAtoms = m.GetNumAtoms()
70
+ nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
71
+ ri = m.GetRingInfo()
72
+ nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
73
+ nMacrocycles = 0
74
+ for x in ri.AtomRings():
75
+ if len(x) > 8:
76
+ nMacrocycles += 1
77
+
78
+ sizePenalty = nAtoms**1.005 - nAtoms
79
+ stereoPenalty = math.log10(nChiralCenters + 1)
80
+ spiroPenalty = math.log10(nSpiro + 1)
81
+ bridgePenalty = math.log10(nBridgeheads + 1)
82
+ macrocyclePenalty = 0.
83
+ # ---------------------------------------
84
+ # This differs from the paper, which defines:
85
+ # macrocyclePenalty = math.log10(nMacrocycles+1)
86
+ # This form generates better results when 2 or more macrocycles are present
87
+ if nMacrocycles > 0:
88
+ macrocyclePenalty = math.log10(2)
89
+
90
+ score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty
91
+
92
+ # correction for the fingerprint density
93
+ # not in the original publication, added in version 1.1
94
+ # to make highly symmetrical molecules easier to synthesise
95
+ score3 = 0.
96
+ if nAtoms > len(fps):
97
+ score3 = math.log(float(nAtoms) / len(fps)) * .5
98
+
99
+ sascore = score1 + score2 + score3
100
+
101
+ # need to transform "raw" value into scale between 1 and 10
102
+ min = -4.0
103
+ max = 2.5
104
+ sascore = 11. - (sascore - min + 1) / (max - min) * 9.
105
+ # smooth the 10-end
106
+ if sascore > 8.:
107
+ sascore = 8. + math.log(sascore + 1. - 9.)
108
+ if sascore > 10.:
109
+ sascore = 10.0
110
+ elif sascore < 1.:
111
+ sascore = 1.0
112
+
113
+ return sascore
114
+
115
+
116
+ def processMols(mols):
117
+ print('smiles\tName\tsa_score')
118
+ for i, m in enumerate(mols):
119
+ if m is None:
120
+ continue
121
+
122
+ s = calculateScore(m)
123
+
124
+ smiles = Chem.MolToSmiles(m)
125
+ print(smiles + "\t" + m.GetProp('_Name') + "\t%3f" % s)
126
+
127
+
128
+ if __name__ == '__main__':
129
+ import sys
130
+ import time
131
+
132
+ t1 = time.time()
133
+ readFragmentScores("fpscores")
134
+ t2 = time.time()
135
+
136
+ suppl = Chem.SmilesMolSupplier(sys.argv[1])
137
+ t3 = time.time()
138
+ processMols(suppl)
139
+ t4 = time.time()
140
+
141
+ print('Reading took %.2f seconds. Calculating took %.2f seconds' % ((t2 - t1), (t4 - t3)),
142
+ file=sys.stderr)
143
+
144
+ #
145
+ # Copyright (c) 2013, Novartis Institutes for BioMedical Research Inc.
146
+ # All rights reserved.
147
+ #
148
+ # Redistribution and use in source and binary forms, with or without
149
+ # modification, are permitted provided that the following conditions are
150
+ # met:
151
+ #
152
+ # * Redistributions of source code must retain the above copyright
153
+ # notice, this list of conditions and the following disclaimer.
154
+ # * Redistributions in binary form must reproduce the above
155
+ # copyright notice, this list of conditions and the following
156
+ # disclaimer in the documentation and/or other materials provided
157
+ # with the distribution.
158
+ # * Neither the name of Novartis Institutes for BioMedical Research Inc.
159
+ # nor the names of its contributors may be used to endorse or promote
160
+ # products derived from this software without specific prior written permission.
161
+ #
162
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
163
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
164
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
165
+ # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
166
+ # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
167
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
168
+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
169
+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
170
+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
171
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
172
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
173
+ #
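
For completeness, scoring a single molecule with this module (the import path assumes the package layout above; exact values depend on the molecule):

    from rdkit import Chem
    from src.sbdd_metrics.sascorer import calculateScore

    mol = Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O')  # aspirin
    sa = calculateScore(mol)                           # scale: 1 (easy) to 10 (hard)
    print(f'SA score: {sa:.2f}')                       # simple drug-like molecules score low
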