Add LlaMol codes
- .gitignore +183 -0
- LICENSE +437 -0
- README.md +162 -5
- assets/llamol.png +0 -0
- config/config.yaml +2 -0
- config/train/llama2-Debug.yaml +47 -0
- config/train/llama2-DebugGPU.yaml +46 -0
- config/train/llama2-M-Full-BRICKS.yaml +46 -0
- config/train/llama2-M-Full-RSS.yaml +46 -0
- config/train/llama2-M-Full.yaml +46 -0
- data/Full_PC9_GAP.parquet +3 -0
- data/RedDB_Full.parquet +3 -0
- data/chembl_log_sascore.parquet +3 -0
- data/combine_all.py +164 -0
- data/opv/prepare_opv.py +265 -0
- data/pubchemqc2020_energy.parquet +3 -0
- data/pubchemqc_energy.parquet +3 -0
- data/qm9_zinc250k_cep/convert_to_parquet.py +41 -0
- data/qm9_zinc250k_cep/qm9_zinc250_cep.parquet +3 -0
- data/vocab.txt +612 -0
- data/zinc/convert_to_parquet.py +67 -0
- data/zinc/zinc_complete/download_zinc.sh +300 -0
- data/zinc/zinc_complete/run_download.py +21 -0
- demonstrator.ipynb +521 -0
- fragment_creator.py +136 -0
- generate_paper_graphs.sh +19 -0
- get_fragment_table.sh +42 -0
- model.py +787 -0
- out/llama2-M-Full-RSS.pt +3 -0
- plot_utils.py +513 -0
- preprocess_dataset.py +370 -0
- requirements.txt +8 -0
- sample.py +616 -0
- tokenizer.py +404 -0
- torch2-env.yaml +29 -0
- train.py +101 -0
- trainLLamaMol.sh +19 -0
- trainLLamaMolDDPSingleNode.sh +28 -0
- trainer.py +513 -0
.gitignore
ADDED
@@ -0,0 +1,183 @@
*.out
debug
debug-gpu
outputs
chemiscope_gen.json
gen_smiles.txt
__pycache__
*.png
*.csv
*.json
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
data/opv/download
data/opv/opv.parquet
data/qm9_zinc250k_cep/zinc_properties.csv
data/qm9_zinc250k_cep/qm9_zinc250k_cep.parquet
data/zinc/zinc_complete/*/*.txt
!data/zinc/zinc_complete/download_zinc.sh
!data/zinc/zinc_complete/run_download.py
data/zinc/zinc_processed
data/zinc/zinc_processed.parquet
data/zinc/zinc_full.parquet
data/OrganiX13.parquet
.cache
out/plots
# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
!assets/*.png
LICENSE
ADDED
@@ -0,0 +1,437 @@
Attribution-NonCommercial-ShareAlike 4.0 International

=======================================================================

Creative Commons Corporation ("Creative Commons") is not a law firm and does not provide legal services or legal advice. Distribution of Creative Commons public licenses does not create a lawyer-client or other relationship. Creative Commons makes its licenses and related information available on an "as-is" basis. Creative Commons gives no warranties regarding its licenses, any material licensed under their terms and conditions, or any related information. Creative Commons disclaims all liability for damages resulting from their use to the fullest extent possible.

Using Creative Commons Public Licenses

Creative Commons public licenses provide a standard set of terms and conditions that creators and other rights holders may use to share original works of authorship and other material subject to copyright and certain other rights specified in the public license below. The following considerations are for informational purposes only, are not exhaustive, and do not form part of our licenses.

Considerations for licensors: Our public licenses are intended for use by those authorized to give the public permission to use material in ways otherwise restricted by copyright and certain other rights. Our licenses are irrevocable. Licensors should read and understand the terms and conditions of the license they choose before applying it. Licensors should also secure all rights necessary before applying our licenses so that the public can reuse the material as expected. Licensors should clearly mark any material not subject to the license. This includes other CC-licensed material, or material used under an exception or limitation to copyright. More considerations for licensors: wiki.creativecommons.org/Considerations_for_licensors

Considerations for the public: By using one of our public licenses, a licensor grants the public permission to use the licensed material under specified terms and conditions. If the licensor's permission is not necessary for any reason--for example, because of any applicable exception or limitation to copyright--then that use is not regulated by the license. Our licenses grant only permissions under copyright and certain other rights that a licensor has authority to grant. Use of the licensed material may still be restricted for other reasons, including because others have copyright or other rights in the material. A licensor may make special requests, such as asking that all changes be marked or described. Although not required by our licenses, you are encouraged to respect those requests where reasonable. More considerations for the public: wiki.creativecommons.org/Considerations_for_licensees

=======================================================================

Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License

By exercising the Licensed Rights (defined below), You accept and agree to be bound by the terms and conditions of this Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Public License ("Public License"). To the extent this Public License may be interpreted as a contract, You are granted the Licensed Rights in consideration of Your acceptance of these terms and conditions, and the Licensor grants You such rights in consideration of benefits the Licensor receives from making the Licensed Material available under these terms and conditions.

Section 1 -- Definitions.

a. Adapted Material means material subject to Copyright and Similar Rights that is derived from or based upon the Licensed Material and in which the Licensed Material is translated, altered, arranged, transformed, or otherwise modified in a manner requiring permission under the Copyright and Similar Rights held by the Licensor. For purposes of this Public License, where the Licensed Material is a musical work, performance, or sound recording, Adapted Material is always produced where the Licensed Material is synched in timed relation with a moving image.

b. Adapter's License means the license You apply to Your Copyright and Similar Rights in Your contributions to Adapted Material in accordance with the terms and conditions of this Public License.

c. BY-NC-SA Compatible License means a license listed at creativecommons.org/compatiblelicenses, approved by Creative Commons as essentially the equivalent of this Public License.

d. Copyright and Similar Rights means copyright and/or similar rights closely related to copyright including, without limitation, performance, broadcast, sound recording, and Sui Generis Database Rights, without regard to how the rights are labeled or categorized. For purposes of this Public License, the rights specified in Section 2(b)(1)-(2) are not Copyright and Similar Rights.

e. Effective Technological Measures means those measures that, in the absence of proper authority, may not be circumvented under laws fulfilling obligations under Article 11 of the WIPO Copyright Treaty adopted on December 20, 1996, and/or similar international agreements.

f. Exceptions and Limitations means fair use, fair dealing, and/or any other exception or limitation to Copyright and Similar Rights that applies to Your use of the Licensed Material.

g. License Elements means the license attributes listed in the name of a Creative Commons Public License. The License Elements of this Public License are Attribution, NonCommercial, and ShareAlike.

h. Licensed Material means the artistic or literary work, database, or other material to which the Licensor applied this Public License.

i. Licensed Rights means the rights granted to You subject to the terms and conditions of this Public License, which are limited to all Copyright and Similar Rights that apply to Your use of the Licensed Material and that the Licensor has authority to license.

j. Licensor means the individual(s) or entity(ies) granting rights under this Public License.

k. NonCommercial means not primarily intended for or directed towards commercial advantage or monetary compensation. For purposes of this Public License, the exchange of the Licensed Material for other material subject to Copyright and Similar Rights by digital file-sharing or similar means is NonCommercial provided there is no payment of monetary compensation in connection with the exchange.

l. Share means to provide material to the public by any means or process that requires permission under the Licensed Rights, such as reproduction, public display, public performance, distribution, dissemination, communication, or importation, and to make material available to the public including in ways that members of the public may access the material from a place and at a time individually chosen by them.

m. Sui Generis Database Rights means rights other than copyright resulting from Directive 96/9/EC of the European Parliament and of the Council of 11 March 1996 on the legal protection of databases, as amended and/or succeeded, as well as other essentially equivalent rights anywhere in the world.

n. You means the individual or entity exercising the Licensed Rights under this Public License. Your has a corresponding meaning.

Section 2 -- Scope.

a. License grant.

   1. Subject to the terms and conditions of this Public License, the Licensor hereby grants You a worldwide, royalty-free, non-sublicensable, non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Material to:

      a. reproduce and Share the Licensed Material, in whole or in part, for NonCommercial purposes only; and

      b. produce, reproduce, and Share Adapted Material for NonCommercial purposes only.

   2. Exceptions and Limitations. For the avoidance of doubt, where Exceptions and Limitations apply to Your use, this Public License does not apply, and You do not need to comply with its terms and conditions.

   3. Term. The term of this Public License is specified in Section 6(a).

   4. Media and formats; technical modifications allowed. The Licensor authorizes You to exercise the Licensed Rights in all media and formats whether now known or hereafter created, and to make technical modifications necessary to do so. The Licensor waives and/or agrees not to assert any right or authority to forbid You from making technical modifications necessary to exercise the Licensed Rights, including technical modifications necessary to circumvent Effective Technological Measures. For purposes of this Public License, simply making modifications authorized by this Section 2(a)(4) never produces Adapted Material.

   5. Downstream recipients.

      a. Offer from the Licensor -- Licensed Material. Every recipient of the Licensed Material automatically receives an offer from the Licensor to exercise the Licensed Rights under the terms and conditions of this Public License.

      b. Additional offer from the Licensor -- Adapted Material. Every recipient of Adapted Material from You automatically receives an offer from the Licensor to exercise the Licensed Rights in the Adapted Material under the conditions of the Adapter's License You apply.

      c. No downstream restrictions. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, the Licensed Material if doing so restricts exercise of the Licensed Rights by any recipient of the Licensed Material.

   6. No endorsement. Nothing in this Public License constitutes or may be construed as permission to assert or imply that You are, or that Your use of the Licensed Material is, connected with, or sponsored, endorsed, or granted official status by, the Licensor or others designated to receive attribution as provided in Section 3(a)(1)(A)(i).

b. Other rights.

   1. Moral rights, such as the right of integrity, are not licensed under this Public License, nor are publicity, privacy, and/or other similar personality rights; however, to the extent possible, the Licensor waives and/or agrees not to assert any such rights held by the Licensor to the limited extent necessary to allow You to exercise the Licensed Rights, but not otherwise.

   2. Patent and trademark rights are not licensed under this Public License.

   3. To the extent possible, the Licensor waives any right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary or waivable statutory or compulsory licensing scheme. In all other cases the Licensor expressly reserves any right to collect such royalties, including when the Licensed Material is used other than for NonCommercial purposes.

Section 3 -- License Conditions.

Your exercise of the Licensed Rights is expressly made subject to the following conditions.

a. Attribution.

   1. If You Share the Licensed Material (including in modified form), You must:

      a. retain the following if it is supplied by the Licensor with the Licensed Material:

         i. identification of the creator(s) of the Licensed Material and any others designated to receive attribution, in any reasonable manner requested by the Licensor (including by pseudonym if designated);

         ii. a copyright notice;

         iii. a notice that refers to this Public License;

         iv. a notice that refers to the disclaimer of warranties;

         v. a URI or hyperlink to the Licensed Material to the extent reasonably practicable;

      b. indicate if You modified the Licensed Material and retain an indication of any previous modifications; and

      c. indicate the Licensed Material is licensed under this Public License, and include the text of, or the URI or hyperlink to, this Public License.

   2. You may satisfy the conditions in Section 3(a)(1) in any reasonable manner based on the medium, means, and context in which You Share the Licensed Material. For example, it may be reasonable to satisfy the conditions by providing a URI or hyperlink to a resource that includes the required information.

   3. If requested by the Licensor, You must remove any of the information required by Section 3(a)(1)(A) to the extent reasonably practicable.

b. ShareAlike.

   In addition to the conditions in Section 3(a), if You Share Adapted Material You produce, the following conditions also apply.

   1. The Adapter's License You apply must be a Creative Commons license with the same License Elements, this version or later, or a BY-NC-SA Compatible License.

   2. You must include the text of, or the URI or hyperlink to, the Adapter's License You apply. You may satisfy this condition in any reasonable manner based on the medium, means, and context in which You Share Adapted Material.

   3. You may not offer or impose any additional or different terms or conditions on, or apply any Effective Technological Measures to, Adapted Material that restrict exercise of the rights granted under the Adapter's License You apply.

Section 4 -- Sui Generis Database Rights.

Where the Licensed Rights include Sui Generis Database Rights that apply to Your use of the Licensed Material:

a. for the avoidance of doubt, Section 2(a)(1) grants You the right to extract, reuse, reproduce, and Share all or a substantial portion of the contents of the database for NonCommercial purposes only;

b. if You include all or a substantial portion of the database contents in a database in which You have Sui Generis Database Rights, then the database in which You have Sui Generis Database Rights (but not its individual contents) is Adapted Material, including for purposes of Section 3(b); and

c. You must comply with the conditions in Section 3(a) if You Share all or a substantial portion of the contents of the database.

For the avoidance of doubt, this Section 4 supplements and does not replace Your obligations under this Public License where the Licensed Rights include other Copyright and Similar Rights.

Section 5 -- Disclaimer of Warranties and Limitation of Liability.

a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.

b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.

c. The disclaimer of warranties and limitation of liability provided above shall be interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability.

Section 6 -- Term and Termination.

a. This Public License applies for the term of the Copyright and Similar Rights licensed here. However, if You fail to comply with this Public License, then Your rights under this Public License terminate automatically.

b. Where Your right to use the Licensed Material has terminated under Section 6(a), it reinstates:

   1. automatically as of the date the violation is cured, provided it is cured within 30 days of Your discovery of the violation; or

   2. upon express reinstatement by the Licensor.

   For the avoidance of doubt, this Section 6(b) does not affect any right the Licensor may have to seek remedies for Your violations of this Public License.

c. For the avoidance of doubt, the Licensor may also offer the Licensed Material under separate terms or conditions or stop distributing the Licensed Material at any time; however, doing so will not terminate this Public License.

d. Sections 1, 5, 6, 7, and 8 survive termination of this Public License.

Section 7 -- Other Terms and Conditions.

a. The Licensor shall not be bound by any additional or different terms or conditions communicated by You unless expressly agreed.

b. Any arrangements, understandings, or agreements regarding the Licensed Material not stated herein are separate from and independent of the terms and conditions of this Public License.

Section 8 -- Interpretation.

a. For the avoidance of doubt, this Public License does not, and shall not be interpreted to, reduce, limit, restrict, or impose conditions on any use of the Licensed Material that could lawfully be made without permission under this Public License.

b. To the extent possible, if any provision of this Public License is deemed unenforceable, it shall be automatically reformed to the minimum extent necessary to make it enforceable. If the provision cannot be reformed, it shall be severed from this Public License without affecting the enforceability of the remaining terms and conditions.

c. No term or condition of this Public License will be waived and no failure to comply consented to unless expressly agreed to by the Licensor.

d. Nothing in this Public License constitutes or may be interpreted as a limitation upon, or waiver of, any privileges and immunities that apply to the Licensor or You, including from the legal processes of any jurisdiction or authority.

=======================================================================

Creative Commons is not a party to its public licenses. Notwithstanding, Creative Commons may elect to apply one of its public licenses to material it publishes and in those instances will be considered the "Licensor." The text of the Creative Commons public licenses is dedicated to the public domain under the CC0 Public Domain Dedication. Except for the limited purpose of indicating that material is shared under a Creative Commons public license or as otherwise permitted by the Creative Commons policies published at creativecommons.org/policies, Creative Commons does not authorize the use of the trademark "Creative Commons" or any other trademark or logo of Creative Commons without its prior written consent including, without limitation, in connection with any unauthorized modifications to any of its public licenses or any other arrangements, understandings, or agreements concerning use of licensed material. For the avoidance of doubt, this paragraph does not form part of the public licenses.

Creative Commons may be contacted at creativecommons.org.
README.md
CHANGED
@@ -1,5 +1,162 @@
# LLamol

<p align="center">
    <img src="assets/llamol.png" width="300" height="300" alt="LLamol">
</p>

This is the official repository for the paper ["LLamol: A Dynamic Multi-Conditional Generative Transformer for De Novo Molecular Design"](https://arxiv.org/abs/2311.14407).
This repository contains the weights for LLamol (`out/llama2-M-Full-RSS.pt`) and the OrganiX13 dataset.

Logo image made with [Hotpot.ai](https://hotpot.ai/art-generator).

## Installation

Install using micromamba for a fast setup: https://mamba.readthedocs.io/en/latest/micromamba-installation.html

```bash
$ "${SHELL}" <(curl -L micro.mamba.pm/install.sh)
$ micromamba env create -f torch2-env.yaml
$ micromamba activate torch2-llamol
$ python sample.py
```
# Download and preprocess the OrganiX13 dataset

If you want to train with the full 13 million molecule dataset, follow the steps below. They are *not* necessary if you just want to use the model for inference:
1. Download and preprocess the OPV dataset by running `data/opv/prepare_opv.py`
2. Download and preprocess the ZINC dataset by running `data/zinc/zinc_complete/run_download.py` followed by `data/zinc/convert_to_parquet.py` (we recommend at least 16 GB of RAM for this)
3. Download and preprocess the QM9/ZINC250k/CEP datasets by running `data/qm9_zinc250k_cep/convert_to_parquet.py`
4. Run `data/combine_all.py` to combine the datasets into `data/OrganiX13.parquet` (this can take a while, especially for the ZINC data; in total it took ~2 hours on a laptop with 16 GB of RAM and an Intel i7 10th Gen)
5. Run `preprocess_dataset.py`, which should create the file `.cache/processed_dataset_None.pkl`

You can then use this file for training by specifying it under the `processed_dataset_ckpt` field of the training .yaml files. A quick sanity check of the combined dataset is sketched below.
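Assuming the steps above succeeded, a minimal sketch of such a sanity check (the column names come from `data/combine_all.py` and match the `context_keys` in the training configs; run from the repo root):

```python
# Quick, hypothetical sanity check of the combined dataset.
import pandas as pd

df = pd.read_parquet("data/OrganiX13.parquet")
print(len(df))  # roughly 13 million rows expected
print(df[["smiles", "logp", "sascore", "mol_weight"]].describe())
```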
# Interactive Demo

After installation you can play around with the model using the `demonstrator.ipynb` notebook. Run all cells and scroll down to the last one; after a short time a UI should appear that lets you interact with the model.
## Training

First activate the environment:
```bash
$ conda activate torch2-llamol # when installed with conda instead of micromamba
# OR
$ micromamba activate torch2-llamol
```

To train locally you can run:
```bash
# Set the config that you want to train with
$ python train.py train=llama2-M-Full-RSS
```

Parameters can also be overridden, for example:
```bash
$ python train.py train=llama2-M-Full-RSS train.model.dim=1024
```
For more information see [Hydra](https://hydra.cc/docs/1.3/intro/).

To start a job on a SLURM cluster use the following script:
```bash
$ sbatch trainLLamaMol.sh
```
## Training Multi-GPU on 1 node with multiple GPUs (nproc_per_node)
```bash
torchrun --standalone --max_restarts=3 --nnodes=1 --nproc_per_node=2 --rdzv-backend=c10d --rdzv-endpoint="localhost:12345" train.py train=llama2-M-Full-RSS > "train_runs/run_MultiGPU.out"
```
## Training Multi-GPU on 1 node with multiple GPUs on a cluster
Currently there is only one script for DDP training. To change the number of GPUs used by that script you have to edit the bash script itself.
TODO: make this more dynamic by allowing the number of GPUs etc. to be set from the command line.
```bash
sbatch trainLLamaMolDDPSingleNode.sh
```
## Sampling
Sampling can be adjusted with the optional parameters shown below.
```bash
$ python sample.py --help

$ python sample.py --num_samples 2000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --seed 4312 --context_cols logp sascore mol_weight --temperature 0.8
```
## Using your own dataset

Use the `preprocess_dataset.py` script to tokenize the dataset, which should be in parquet or CSV format.
The SMILES used for training must be in a `smiles` column, and all conditions should be given to the pretokenize function; a minimal sketch of preparing such a file follows this section.
After the preprocessing is done, a file named `processed_dataset_{limit}.pkl` is stored in the `.cache` directory.
You may want to rename this file so it is not overwritten every time you run the preprocessing.

The `.cache/processed_dataset_{limit}.pkl` file can then be set in the `processed_dataset_ckpt` field of `config/train/llama2-M-Full-RSS.yaml` to train on the new dataset.
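For illustration, a minimal sketch of writing a compatible input file; the file name and property values here are hypothetical, and only the `smiles` column is strictly required:

```python
# Hypothetical input for preprocess_dataset.py: SMILES plus whatever
# condition columns you pass to the pretokenize function.
import pandas as pd

df = pd.DataFrame({
    "smiles": ["CCO", "c1ccccc1", "CC(=O)O"],  # ethanol, benzene, acetic acid
    "logp": [-0.31, 2.13, -0.17],              # illustrative values only
})
df.to_parquet("my_dataset.parquet")
```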
# Training methods

The training method we used and described in the paper is called RSS, for "Random SMILES Sampling": as described in the "Stochastic Context Learning" procedure, a random subsequence of the current SMILES is taken during training and fed into the model as the token sequence condition. The model used in the paper is therefore `out/llama2-M-Full-RSS.pt`.

We also tried other approaches for including the token sequence.
One used Murcko scaffolds, as in the MolGPT paper, but this approach did not yield good results for our purposes.
The other used BRICS decomposition, which also did not yield very good results.

The different methods are implemented in the `fragment_creator.py` file; a sketch of the RSS idea follows this section.
Each of the models was trained with its respective configuration in the `config/train` folder.
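For intuition, a minimal sketch of the RSS idea. This is illustrative only (the `min_frac`/`max_frac` parameters are made up here); the actual implementation lives in `fragment_creator.py` and operates on the tokenized SMILES:

```python
import random

def random_smiles_subsequence(tokens, min_frac=0.1, max_frac=0.5):
    """Take a random contiguous subsequence of a tokenized SMILES.

    During training this fragment is fed to the model as the token
    sequence condition alongside the numerical properties.
    """
    n = len(tokens)
    length = random.randint(max(1, int(min_frac * n)), max(1, int(max_frac * n)))
    start = random.randint(0, n - length)
    return tokens[start:start + length]

# Character-level stand-in; the real tokenization is done by tokenizer.py.
print(random_smiles_subsequence(list("c1ccccc1O")))
```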
# Thanks

- [Karpathy](https://github.com/karpathy/llama2.c) for the implementation of the Llama 2 architecture and training code
- [DeepChem](https://github.com/deepchem/deepchem) for the SmilesTokenizer
- [TorchDrug](https://github.com/DeepGraphLearning/torchdrug/) for the download scripts for the OPV and CEP datasets
- ZINC 15 dataset (Teague Sterling and John J. Irwin. ZINC 15 – ligand discovery for everyone. Journal of Chemical Information and Modeling, 55(11):2324–2337, November 2015.)
- QM9 dataset (Raghunathan Ramakrishnan, Pavlo O. Dral, Matthias Rupp, and O. Anatole von Lilienfeld. Quantum chemistry structures and properties of 134 kilo molecules. Scientific Data, 1(1), August 2014.)
- PC9 dataset (Marta Glavatskikh, Jules Leguy, Gilles Hunault, Thomas Cauchy, and Benoit Da Mota. Dataset's chemical diversity limits the generalizability of machine learning predictions. Journal of Cheminformatics, 11(1), November 2019.)
- ZINC 250k (Rafael Gómez-Bombarelli, Jennifer N. Wei, David Duvenaud, José Miguel Hernández-Lobato, Benjamín Sánchez-Lengeling, Dennis Sheberla, Jorge Aguilera-Iparraguirre, Timothy D. Hirzel, Ryan P. Adams, and Alán Aspuru-Guzik. Automatic chemical design using a data-driven continuous representation of molecules. ACS Central Science, 4(2):268–276, January 2018.)
- RedDB (Elif Sorkun, Qi Zhang, Abhishek Khetan, Murat Cihan Sorkun, and Süleyman Er. RedDB, a computational database of electroactive molecules for aqueous redox flow batteries. Scientific Data, 9(1), November 2022.)
- OPV (Peter C. St. John, Caleb Phillips, Travis W. Kemper, A. Nolan Wilson, Yanfei Guan, Michael F. Crowley, Mark R. Nimlos, and Ross E. Larsen. Message-passing neural networks for high-throughput polymer screening. The Journal of Chemical Physics, 150(23):234111, June 2019.)
- PubChemQC 2020 (Maho Nakata, Tomomi Shimazaki, Masatomo Hashimoto, and Toshiyuki Maeda. PubChemQC PM6: Data sets of 221 million molecules with optimized molecular geometries and electronic properties. Journal of Chemical Information and Modeling, 60(12):5891–5899, October 2020.)
- PubChemQC 2017 (Maho Nakata and Tomomi Shimazaki. PubChemQC project: A large-scale first-principles electronic structure database for data-driven chemistry. Journal of Chemical Information and Modeling, 57(6):1300–1308, May 2017.)
- CEP (Johannes Hachmann, Roberto Olivares-Amaya, Sule Atahan-Evrenk, Carlos Amador-Bedolla, Roel S. Sánchez-Carrera, Aryeh Gold-Parker, Leslie Vogt, Anna M. Brockway, and Alán Aspuru-Guzik. The Harvard clean energy project: Large-scale computational screening and design of organic photovoltaics on the world community grid. The Journal of Physical Chemistry Letters, 2(17):2241–2251, August 2011.) subset (David Duvenaud, Dougal Maclaurin, Jorge Aguilera-Iparraguirre, Rafael Gómez-Bombarelli, Timothy Hirzel, Alán Aspuru-Guzik, and Ryan P. Adams. Convolutional networks on graphs for learning molecular fingerprints, 2015.)
- ChEMBL (James Blackshaw, Anna Gaulton, A. Patrícia Bento, Marleen De Veij, David Mendez Lopez, Nicolas Bosc, Juan Felipe Mosquera Morales, María Paula Margariños, Andrew Leach, Emma Manners, Barbara Zdrazil, Harris Ioannidis, Fiona Hunter, Eloy Félix, and Ricardo Arcila Toro. ChEMBL database release 31, September 2009.)

# Funding disclaimer

This project has received funding from the European Union's Horizon 2020 research and innovation programme under Grant Agreement no. 875489.

This website reflects only the author's view. The funding agency is not responsible for any use made of the information it contains.

# License
<p xmlns:cc="http://creativecommons.org/ns#" xmlns:dct="http://purl.org/dc/terms/"><span property="dct:title">LLamol is licensed under <a href="http://creativecommons.org/licenses/by-nc-sa/4.0/?ref=chooser-v1" target="_blank" rel="license noopener noreferrer" style="display:inline-block;">CC BY-NC-SA 4.0<img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/cc.svg?ref=chooser-v1"><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/by.svg?ref=chooser-v1"><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/nc.svg?ref=chooser-v1"><img style="height:22px!important;margin-left:3px;vertical-align:text-bottom;" src="https://mirrors.creativecommons.org/presskit/icons/sa.svg?ref=chooser-v1"></a></p>
assets/llamol.png
ADDED
config/config.yaml
ADDED
@@ -0,0 +1,2 @@
defaults:
  - train: "llama2-Debug"
config/train/llama2-Debug.yaml
ADDED
@@ -0,0 +1,47 @@
io:
  # I/O
  out_dir : "debug"
  eval_interval : 10
  log_interval : 10
  eval_iters : 5
  eval_only : false # if True, script exits right after the first eval
  always_save_checkpoint : true # if True, always save a checkpoint after each eval
  init_from : "scratch" # 'scratch' or 'resume'
  resume_when_snapshot_available: false

loader:
  batch_size : 4 # if gradient_accumulation_steps > 1, this is the micro-batch size
  max_seq_len : 768
  dataset : "smiles"
  processed_dataset_ckpt : "processed_dataset_500000.pkl"
  fragment_creator : "rss"

model:
  dim : 32
  n_layers : 1
  n_heads : 1
  multiple_of : 16
  dropout : 0.1

context:
  context_keys: ["logp", "sascore", "mol_weight"]
  context_dims : [1,1,1]

optimizer:
  gradient_accumulation_steps : 4 # used to simulate larger batch sizes
  learning_rate : 1e-4 # max learning rate
  max_iters : 20 # total number of training iterations
  weight_decay : 1e-1
  beta1 : 0.9
  beta2 : 0.95
  grad_clip : 1.0 # clip gradients at this value, or disable if == 0.0
  # learning rate decay settings
  decay_lr : true # whether to decay the learning rate
  warmup_iters : 10 # how many steps to warm up for
  lr_decay_iters : 100 # should be ~= max_iters per Chinchilla
  min_lr : 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

dtype : "float16" # Use float16 for training, could also be changed to float32 or bfloat16
compile : false # Use torch.compile, but in my test this is really slow
label : "llama2-Debug"
profile : false
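With these settings the effective batch size is `batch_size * gradient_accumulation_steps` (here 4 * 4 = 16). A sketch of the usual accumulation loop, assuming a model that returns a scalar loss (not necessarily the exact code in `trainer.py`):

```python
import torch

def train_step(model, optimizer, micro_batches, grad_accum_steps=4, grad_clip=1.0):
    optimizer.zero_grad(set_to_none=True)
    for x, y in micro_batches[:grad_accum_steps]:
        loss = model(x, y)                      # assumed to return a scalar loss
        (loss / grad_accum_steps).backward()    # average gradients over micro-batches
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)  # grad_clip above
    optimizer.step()
```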
config/train/llama2-DebugGPU.yaml
ADDED
@@ -0,0 +1,46 @@
io:
  # I/O
  out_dir : "debug-gpu"
  eval_interval : 10
  log_interval : 10
  eval_iters : 5
  eval_only : false # if True, script exits right after the first eval
  always_save_checkpoint : true # if True, always save a checkpoint after each eval
  init_from : "scratch" # 'scratch' or 'resume'
  resume_when_snapshot_available: false

loader:
  batch_size : 256 # if gradient_accumulation_steps > 1, this is the micro-batch size
  max_seq_len : 256
  dataset : "smiles"
  processed_dataset_ckpt : "processed_dataset_500000.pkl"

model:
  dim : 256
  n_layers : 8
  n_heads : 8
  multiple_of : 128
  dropout : 0.1

context:
  context_keys: ["logp", "sascore", "mol_weight"]
  context_dims : [1,1,1]

optimizer:
  gradient_accumulation_steps : 4 # used to simulate larger batch sizes
  learning_rate : 1e-4 # max learning rate
  max_iters : 25 # total number of training iterations
  weight_decay : 1e-1
  beta1 : 0.9
  beta2 : 0.95
  grad_clip : 1.0 # clip gradients at this value, or disable if == 0.0
  # learning rate decay settings
  decay_lr : true # whether to decay the learning rate
  warmup_iters : 10 # how many steps to warm up for
  lr_decay_iters : 100 # should be ~= max_iters per Chinchilla
  min_lr : 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

dtype : "float16" # Use float16 for training, could also be changed to float32 or bfloat16
compile : false # Use torch.compile, but in my test this is really slow
label : "llama2-Debug"
profile: true # Profile the run
config/train/llama2-M-Full-BRICKS.yaml
ADDED
@@ -0,0 +1,46 @@
io:
  # I/O
  out_dir : "out"
  eval_interval : 500
  log_interval : 10
  eval_iters : 10
  eval_only : false # if True, script exits right after the first eval
  always_save_checkpoint : false # if True, always save a checkpoint after each eval
  init_from : "scratch" # 'scratch' or 'resume'
  resume_when_snapshot_available: true

loader:
  batch_size : 384 # if gradient_accumulation_steps > 1, this is the micro-batch size
  max_seq_len : 768
  dataset : "smiles"
  processed_dataset_ckpt : "processed_dataset_None.pkl"
  fragment_creator : "bricks"

model:
  dim : 256
  n_layers : 8
  n_heads : 8
  multiple_of : 128
  dropout : 0.1

context:
  context_keys: ["logp", "sascore", "mol_weight"]
  context_dims : [1,1,1]

optimizer:
  gradient_accumulation_steps : 4 # used to simulate larger batch sizes
  learning_rate : 1e-4 # max learning rate
  max_iters : 100000 # total number of training iterations
  weight_decay : 1e-1
  beta1 : 0.9
  beta2 : 0.95
  grad_clip : 1.0 # clip gradients at this value, or disable if == 0.0
  # learning rate decay settings
  decay_lr : true # whether to decay the learning rate
  warmup_iters : 1000 # how many steps to warm up for
  lr_decay_iters : 100000 # should be ~= max_iters per Chinchilla
  min_lr : 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

dtype : "float16" # Use float16 for training, could also be changed to float32 or bfloat16
compile : false # Use torch.compile, but in my test this is really slow
label : "llama2-M-Full-BRICKS"
config/train/llama2-M-Full-RSS.yaml
ADDED
@@ -0,0 +1,46 @@
io:
  # I/O
  out_dir : "out"
  eval_interval : 500
  log_interval : 10
  eval_iters : 10
  eval_only : false # if True, script exits right after the first eval
  always_save_checkpoint : false # if True, always save a checkpoint after each eval
  init_from : "scratch" # 'scratch' or 'resume'
  resume_when_snapshot_available: true # resume the training always, when the `snapshot_` is available in the out/ folder

loader:
  batch_size : 256 # if gradient_accumulation_steps > 1, this is the micro-batch size
  max_seq_len : 256 # the maximum sequence length we want to use in the training data.
  dataset : "smiles"
  processed_dataset_ckpt : "processed_dataset_None.pkl"
  fragment_creator : "rss" # the method we want to use to train with the token_sequence

model:
  dim : 384
  n_layers : 8
  n_heads : 8
  multiple_of : 128
  dropout : 0.1

context:
  context_keys: ["logp", "sascore", "mol_weight"]
  context_dims : [1,1,1]

optimizer:
  gradient_accumulation_steps : 4 # used to simulate larger batch sizes
  learning_rate : 1e-4 # max learning rate
  max_iters : 100000 # total number of training iterations
  weight_decay : 1e-1
  beta1 : 0.9
  beta2 : 0.95
  grad_clip : 1.0 # clip gradients at this value, or disable if == 0.0
  # learning rate decay settings
  decay_lr : true # whether to decay the learning rate
  warmup_iters : 1000 # how many steps to warm up for
  lr_decay_iters : 100000 # should be ~= max_iters per Chinchilla
  min_lr : 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

dtype : "float16" # Use float16 for training, could also be changed to float32 or bfloat16
compile : false # Use torch.compile, but in my test this is really slow
label : "llama2-M-Full-RSS" # the name of the output file / model
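The learning-rate fields above describe the usual warmup-plus-cosine-decay schedule from Karpathy's llama2.c training loop, which this repository credits for its training code. A minimal sketch of that schedule (not necessarily the exact code in `trainer.py`):

```python
import math

def get_lr(it, learning_rate=1e-4, warmup_iters=1000, lr_decay_iters=100_000, min_lr=0.0):
    """Warmup then cosine decay, mirroring the optimizer settings above."""
    if it < warmup_iters:                        # 1) linear warmup
        return learning_rate * it / warmup_iters
    if it > lr_decay_iters:                      # 2) past the decay horizon: floor
        return min_lr
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # 3) cosine from 1 to 0
    return min_lr + coeff * (learning_rate - min_lr)
```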
config/train/llama2-M-Full.yaml
ADDED
@@ -0,0 +1,46 @@
io:
  # I/O
  out_dir : "out"
  eval_interval : 500
  log_interval : 10
  eval_iters : 10
  eval_only : false # if True, script exits right after the first eval
  always_save_checkpoint : false # if True, always save a checkpoint after each eval
  init_from : "scratch" # 'scratch' or 'resume'
  resume_when_snapshot_available: true

loader:
  batch_size : 384 # if gradient_accumulation_steps > 1, this is the micro-batch size
  max_seq_len : 768
  dataset : "smiles"
  processed_dataset_ckpt : "processed_dataset_None.pkl"
  fragment_creator : null

model:
  dim : 256
  n_layers : 8
  n_heads : 8
  multiple_of : 128
  dropout : 0.1

context:
  context_keys: ["logp", "sascore", "mol_weight"]
  context_dims : [1,1,1]

optimizer:
  gradient_accumulation_steps : 4 # used to simulate larger batch sizes
  learning_rate : 1e-4 # max learning rate
  max_iters : 100000 # total number of training iterations
  weight_decay : 1e-1
  beta1 : 0.9
  beta2 : 0.95
  grad_clip : 1.0 # clip gradients at this value, or disable if == 0.0
  # learning rate decay settings
  decay_lr : true # whether to decay the learning rate
  warmup_iters : 1000 # how many steps to warm up for
  lr_decay_iters : 100000 # should be ~= max_iters per Chinchilla
  min_lr : 0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla

dtype : "float16" # Use float16 for training, could also be changed to float32 or bfloat16
compile : false # Use torch.compile, but in my test this is really slow
label : "llama2-M-Full"
data/Full_PC9_GAP.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0e1c1932284e5987ff997675b3f8ad2a8763c4dc864315e78a774841fb6b6791
size 38893336
data/RedDB_Full.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:543e98ba1b622a2a949a3818d047daa478658d2d91923a291907a2d9c8c886bd
size 1024066
data/chembl_log_sascore.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:30d04f6f1f01caec6164d85b23ba1282dfe63ec1b245e4c358aa216831c32ee8
size 99582099
data/combine_all.py
ADDED
@@ -0,0 +1,164 @@
+import pandas as pd
+import numpy as np
+import os
+from rdkit import Chem
+from rdkit.Chem import Descriptors
+import multiprocessing
+
+from rdkit import Chem
+from rdkit.Chem import RDConfig
+import os
+import sys
+sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
+# now you can import sascorer!
+import sascorer
+
+np.random.seed(42)
+
+def calcLogPIfMol(smi):
+    m = Chem.MolFromSmiles(smi)
+    if m is not None:
+        return Descriptors.MolLogP(m)
+    else:
+        return None
+
+def calcMol(smi):
+    return Chem.MolFromSmiles(smi)
+
+def calcMolWeight(smi):
+    mol = Chem.MolFromSmiles(smi)
+    return Descriptors.ExactMolWt(mol)
+
+def calcSascore(smi):
+    mol = Chem.MolFromSmiles(smi)
+    return sascorer.calculateScore(mol)
+
+def calculateValues(smi: pd.Series):
+    with multiprocessing.Pool(8) as pool:
+        print("Starting logps")
+        logps = pool.map(calcLogPIfMol, smi)
+        print("Done logps")
+        valid_mols = ~pd.isna(logps)
+        logps = pd.Series(logps)[valid_mols]
+        smi = pd.Series(smi)[valid_mols]
+        logps.reset_index(drop=True, inplace=True)
+        smi.reset_index(drop=True, inplace=True)
+        print("Starting mol weights")
+        mol_weights = pool.map(calcMolWeight, smi)
+        print("Done mol weights")
+        print("Starting sascores")
+        sascores = pool.map(calcSascore, smi)
+        print("Done sascores")
+
+    return smi, logps, mol_weights, sascores
+
+def calculateProperties(df):
+    smi, logps, mol_weights, sascores = calculateValues(df["smiles"])
+    out_df = pd.DataFrame({"smiles": smi, "logp": logps, "mol_weight": mol_weights, "sascore": sascores})
+
+    return out_df
+
+if __name__ == "__main__":
+
+    cwd = os.path.dirname(__file__)
+
+    print("df_pc9")
+    df_pc9 = pd.read_parquet(os.path.join(cwd, "Full_PC9_GAP.parquet"))
+    df_pc9 = calculateProperties(df_pc9)
+
+    print("df_zinc_full")
+    df_zinc_full = pd.read_parquet(
+        os.path.join(cwd, "zinc", "zinc_processed.parquet")
+    )
+    df_zinc_full = df_zinc_full.sample(n=5_000_000)
+    df_zinc_full = calculateProperties(df_zinc_full)
+
+    print("df_zinc_qm9")
+    df_zinc_qm9 = pd.read_parquet(os.path.join(cwd, "qm9_zinc250k_cep", "qm9_zinc250_cep.parquet"))
+    df_zinc_qm9 = calculateProperties(df_zinc_qm9)
+
+    print("df_opv")
+    df_opv = pd.read_parquet(os.path.join(cwd, "opv", "opv.parquet"))
+    df_opv = calculateProperties(df_opv)
+
+    print("df_reddb")
+    # Source: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/F3QFSQ
+    df_reddb = pd.read_parquet(os.path.join(cwd, "RedDB_Full.parquet"))
+    df_reddb = calculateProperties(df_reddb)
+
+    print("df_chembl")
+    df_chembl = pd.read_parquet(
+        os.path.join(cwd, "chembl_log_sascore.parquet")
+    )
+    df_chembl = calculateProperties(df_chembl)
+
+    print("df_pubchemqc_2017")
+    df_pubchemqc_2017 = pd.read_parquet(
+        os.path.join(cwd, "pubchemqc_energy.parquet")
+    )
+    df_pubchemqc_2017 = calculateProperties(df_pubchemqc_2017)
+
+    print("df_pubchemqc_2020")
+    df_pubchemqc_2020 = pd.read_parquet(
+        os.path.join(cwd, "pubchemqc2020_energy.parquet")
+    )
+    df_pubchemqc_2020 = calculateProperties(df_pubchemqc_2020)
+
+    df_list = [
+        df_zinc_qm9,
+        df_opv,
+        df_pubchemqc_2017,
+        df_pubchemqc_2020,
+        df_zinc_full,
+        df_reddb,
+        df_pc9,
+        df_chembl,
+    ]
+
+    print(f"ZINC QM9 {len(df_zinc_qm9)}")
+    print(f"df_opv {len(df_opv)}")
+    print(f"df_pubchemqc_2017 {len(df_pubchemqc_2017)}")
+    print(f"df_pubchemqc_2020 {len(df_pubchemqc_2020)}")
+    print(f"df_zinc_full {len(df_zinc_full)}")
+    print(f"df_reddb {len(df_reddb)}")
+    print(f"df_pc9 {len(df_pc9)}")
+    print(f"df_chembl {len(df_chembl)}")
+
+    all_columns = [
+        "smiles",
+        "logp",
+        "sascore",
+        "mol_weight"
+    ]  # set([*df_zinc_qm9.columns.tolist(), *df_pubchemqc_2017.columns.tolist(), *df_pubchemqc_2020.columns.tolist(), *df_zinc_full.columns.tolist()])
+    print("concatenating")
+    df = pd.concat(
+        df_list, axis=0, ignore_index=True
+    )  # pd.DataFrame(columns=all_columns)
+    df = df[all_columns]  # .fillna(0)
+    # df = df.sample(n=7_500_000)
+    df.reset_index(drop=True, inplace=True)
+    df["mol_weight"] = df["mol_weight"] / 100.0
+
+    print(df.head())
+    print("saving")
+    print("Combined len:", len(df))
+    df.to_parquet(
+        os.path.join(cwd, "OrganiX13.parquet")
+    )
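A quick way to see what `calculateProperties` stores per molecule is to run the same three RDKit descriptors on a toy SMILES list. A sketch, assuming RDKit and the contrib `sascorer` module are importable exactly as in the script above:

import os, sys
from rdkit import Chem
from rdkit.Chem import Descriptors, RDConfig

sys.path.append(os.path.join(RDConfig.RDContribDir, "SA_Score"))
import sascorer

for smi in ["CCO", "c1ccccc1"]:
    mol = Chem.MolFromSmiles(smi)
    # Same descriptors the script stores: logp, sascore, mol_weight
    print(smi, Descriptors.MolLogP(mol), sascorer.calculateScore(mol),
          Descriptors.ExactMolWt(mol))

Note that before saving the combined OrganiX13.parquet, the script additionally rescales `mol_weight` by 1/100.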
data/opv/prepare_opv.py
ADDED
@@ -0,0 +1,265 @@
+import math
+import os
+import struct
+import logging
+from tqdm import tqdm
+import csv
+from collections import defaultdict
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+# Taken from here https://torchdrug.ai/docs/_modules/torchdrug/utils/file.html#download
+def download(url, path, save_file=None, md5=None):
+    """
+    Download a file from the specified url.
+    Skip the downloading step if there exists a file satisfying the given MD5.
+
+    Parameters:
+        url (str): URL to download
+        path (str): path to store the downloaded file
+        save_file (str, optional): name of save file. If not specified, infer the file name from the URL.
+        md5 (str, optional): MD5 of the file
+    """
+    from six.moves.urllib.request import urlretrieve
+
+    if save_file is None:
+        save_file = os.path.basename(url)
+        if "?" in save_file:
+            save_file = save_file[:save_file.find("?")]
+    save_file = os.path.join(path, save_file)
+
+    if not os.path.exists(save_file) or compute_md5(save_file) != md5:
+        logger.info("Downloading %s to %s" % (url, save_file))
+        urlretrieve(url, save_file)
+    return save_file
+
+
+def smart_open(file_name, mode="rb"):
+    """
+    Open a regular file or a zipped file.
+
+    This function can be used as drop-in replacement of the builtin function `open()`.
+
+    Parameters:
+        file_name (str): file name
+        mode (str, optional): open mode for the file stream
+    """
+    import bz2
+    import gzip
+
+    extension = os.path.splitext(file_name)[1]
+    if extension == '.bz2':
+        return bz2.BZ2File(file_name, mode)
+    elif extension == '.gz':
+        return gzip.GzipFile(file_name, mode)
+    else:
+        return open(file_name, mode)
+
+
+def extract(zip_file, member=None):
+    """
+    Extract files from a zip file. Currently, ``zip``, ``gz``, ``tar.gz``, ``tar`` file types are supported.
+
+    Parameters:
+        zip_file (str): file name
+        member (str, optional): extract specific member from the zip file.
+            If not specified, extract all members.
+    """
+    import gzip
+    import shutil
+    import zipfile
+    import tarfile
+
+    zip_name, extension = os.path.splitext(zip_file)
+    if zip_name.endswith(".tar"):
+        extension = ".tar" + extension
+        zip_name = zip_name[:-4]
+    save_path = os.path.dirname(zip_file)
+
+    if extension == ".gz":
+        member = os.path.basename(zip_name)
+        members = [member]
+        save_files = [os.path.join(save_path, member)]
+        for _member, save_file in zip(members, save_files):
+            with open(zip_file, "rb") as fin:
+                fin.seek(-4, 2)
+                file_size = struct.unpack("<I", fin.read())[0]
+            with gzip.open(zip_file, "rb") as fin:
+                if not os.path.exists(save_file) or file_size != os.path.getsize(save_file):
+                    logger.info("Extracting %s to %s" % (zip_file, save_file))
+                    with open(save_file, "wb") as fout:
+                        shutil.copyfileobj(fin, fout)
+    elif extension in [".tar.gz", ".tgz", ".tar"]:
+        tar = tarfile.open(zip_file, "r")
+        if member is not None:
+            members = [member]
+            save_files = [os.path.join(save_path, os.path.basename(member))]
+            logger.info("Extracting %s from %s to %s" % (member, zip_file, save_files[0]))
+        else:
+            members = tar.getnames()
+            save_files = [os.path.join(save_path, _member) for _member in members]
+            logger.info("Extracting %s to %s" % (zip_file, save_path))
+        for _member, save_file in zip(members, save_files):
+            if tar.getmember(_member).isdir():
+                os.makedirs(save_file, exist_ok=True)
+                continue
+            os.makedirs(os.path.dirname(save_file), exist_ok=True)
+            if not os.path.exists(save_file) or tar.getmember(_member).size != os.path.getsize(save_file):
+                with tar.extractfile(_member) as fin, open(save_file, "wb") as fout:
+                    shutil.copyfileobj(fin, fout)
+    elif extension == ".zip":
+        zipped = zipfile.ZipFile(zip_file)
+        if member is not None:
+            members = [member]
+            save_files = [os.path.join(save_path, os.path.basename(member))]
+            logger.info("Extracting %s from %s to %s" % (member, zip_file, save_files[0]))
+        else:
+            members = zipped.namelist()
+            save_files = [os.path.join(save_path, _member) for _member in members]
+            logger.info("Extracting %s to %s" % (zip_file, save_path))
+        for _member, save_file in zip(members, save_files):
+            if zipped.getinfo(_member).is_dir():
+                os.makedirs(save_file, exist_ok=True)
+                continue
+            os.makedirs(os.path.dirname(save_file), exist_ok=True)
+            if not os.path.exists(save_file) or zipped.getinfo(_member).file_size != os.path.getsize(save_file):
+                with zipped.open(_member, "r") as fin, open(save_file, "wb") as fout:
+                    shutil.copyfileobj(fin, fout)
+    else:
+        raise ValueError("Unknown file extension `%s`" % extension)
+
+    if len(save_files) == 1:
+        return save_files[0]
+    else:
+        return save_path
+
+
+def compute_md5(file_name, chunk_size=65536):
+    """
+    Compute MD5 of the file.
+
+    Parameters:
+        file_name (str): file name
+        chunk_size (int, optional): chunk size for reading large files
+    """
+    import hashlib
+
+    md5 = hashlib.md5()
+    with open(file_name, "rb") as fin:
+        chunk = fin.read(chunk_size)
+        while chunk:
+            md5.update(chunk)
+            chunk = fin.read(chunk_size)
+    return md5.hexdigest()
+
+
+def get_line_count(file_name, chunk_size=8192*1024):
+    """
+    Get the number of lines in a file.
+
+    Parameters:
+        file_name (str): file name
+        chunk_size (int, optional): chunk size for reading large files
+    """
+    count = 0
+    with open(file_name, "rb") as fin:
+        chunk = fin.read(chunk_size)
+        while chunk:
+            count += chunk.count(b"\n")
+            chunk = fin.read(chunk_size)
+    return count
+
+
+class OPV:
+    """
+    Quantum mechanical calculations on organic photovoltaic candidate molecules.
+
+    Statistics:
+        - #Molecule: 94,576
+        - #Regression task: 8
+
+    Parameters:
+        path (str): path to store the dataset
+        verbose (int, optional): output verbose level
+        **kwargs
+    """
+
+    train_url = "https://cscdata.nrel.gov/api/datasets/ad5d2c9a-af0a-4d72-b943-1e433d5750d6/download/" \
+                "b69cf9a5-e7e0-405b-88cb-40df8007242e"
+    valid_url = "https://cscdata.nrel.gov/api/datasets/ad5d2c9a-af0a-4d72-b943-1e433d5750d6/download/" \
+                "1c8e7379-3071-4360-ba8e-0c6481c33d2c"
+    test_url = "https://cscdata.nrel.gov/api/datasets/ad5d2c9a-af0a-4d72-b943-1e433d5750d6/download/" \
+               "4ef40592-0080-4f00-9bb7-34b25f94962a"
+    train_md5 = "16e439b7411ea0a8d3a56ba4802b61b1"
+    valid_md5 = "3aa2ac62015932ca84661feb5d29adda"
+    test_md5 = "bad072224f0755478f0729476ca99a33"
+    target_fields = ["gap", "homo", "lumo", "spectral_overlap", "gap_extrapolated", "homo_extrapolated",
+                     "lumo_extrapolated", "optical_lumo_extrapolated"]
+
+    def read_csv(self, csv_file, smiles_field="smiles", target_fields=None, verbose=0):
+        if target_fields is not None:
+            target_fields = set(target_fields)
+
+        with open(csv_file, "r") as fin:
+            reader = csv.reader(fin)
+            if verbose:
+                reader = iter(tqdm(reader, "Loading %s" % csv_file, get_line_count(csv_file)))
+            fields = next(reader)
+            smiles = []
+            targets = defaultdict(list)
+            for i, values in enumerate(reader):
+                if not any(values):
+                    continue
+                if smiles_field is None:
+                    smiles.append("")
+                for field, value in zip(fields, values):
+                    if field == smiles_field:
+                        smiles.append(value)
+                    elif target_fields is None or field in target_fields:
+                        pass
+                        # value = eval(value)
+                        # if value == "":
+                        #     value = math.nan
+                        # targets[field].append(value)
+
+        return smiles, targets
+
+    def __init__(self, path, verbose=1, **kwargs):
+        path = os.path.expanduser(path)
+        if not os.path.exists(path):
+            os.makedirs(path)
+        self.path = path
+
+        train_zip_file = download(self.train_url, path, save_file="mol_train.csv.gz", md5=self.train_md5)
+        valid_zip_file = download(self.valid_url, path, save_file="mol_valid.csv.gz", md5=self.valid_md5)
+        test_zip_file = download(self.test_url, path, save_file="mol_test.csv.gz", md5=self.test_md5)
+        train_file = extract(train_zip_file)
+        valid_file = extract(valid_zip_file)
+        test_file = extract(test_zip_file)
+
+        train_smiles, train_targets = self.read_csv(train_file, smiles_field="smile", target_fields=self.target_fields)
+        valid_smiles, valid_targets = self.read_csv(valid_file, smiles_field="smile", target_fields=self.target_fields)
+        test_smiles, test_targets = self.read_csv(test_file, smiles_field="smile", target_fields=self.target_fields)
+        self.num_train = len(train_smiles)
+        self.num_valid = len(valid_smiles)
+        self.num_test = len(test_smiles)
+
+        smiles = train_smiles + valid_smiles + test_smiles
+        targets = {k: train_targets[k] + valid_targets[k] + test_targets[k] for k in train_targets}
+
+        # self.load_smiles(smiles, targets, verbose=verbose, **kwargs)
+        print(smiles[:10])
+        df_out = pd.DataFrame({"smiles": smiles})
+        df_out.to_parquet(os.path.join(os.path.dirname(__file__), "opv.parquet"))
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    cwd = os.path.join(os.path.dirname(__file__), "download")
+    os.makedirs(cwd, exist_ok=True)
+    d = OPV(cwd)
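Running the module as in its `__main__` block downloads the three NREL CSV splits into data/opv/download and writes data/opv/opv.parquet with a single `smiles` column. A sketch of consuming the result afterwards (assumes the script has already been run):

import pandas as pd

df_opv = pd.read_parquet("data/opv/opv.parquet")
print(len(df_opv), df_opv["smiles"].head())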
data/pubchemqc2020_energy.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4d5ef9f419a48be52b1fe6332eb08d77df0b6ff7ec34f8c99c06e63fa232abf1
+size 39165769
data/pubchemqc_energy.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5ca78b6f81f04ddcc2ed6e031d86f0a2f1e38d6c4001bfd93a28005b7168cf8
+size 89749991
data/qm9_zinc250k_cep/convert_to_parquet.py
ADDED
@@ -0,0 +1,41 @@
+import pandas as pd
+import requests
+import hashlib
+import os
+# Download and read zinc_properties file
+zinc_url = "https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv"
+zinc_md5 = "b59078b2b04c6e9431280e3dc42048d5"
+zinc_filename = "zinc_properties.csv"
+
+response = requests.get(zinc_url)
+downloaded_data = response.content
+
+downloaded_md5 = hashlib.md5(downloaded_data).hexdigest()
+if zinc_md5 == downloaded_md5:
+    with open(zinc_filename, 'wb') as f:
+        f.write(downloaded_data)
+    print(f"File '{zinc_filename}' downloaded and saved.")
+else:
+    raise ValueError("MD5 checksum does not match")
+
+zinc_df = pd.read_csv(zinc_filename)
+zinc_df = zinc_df[["smiles"]]
+
+cwd = os.path.dirname(__file__)
+
+qm9_filename = os.path.join(cwd, "QM9IsoFull.csv")
+cep_filename = os.path.join(cwd, "cep-processed.csv")
+
+qm9_df = pd.read_csv(qm9_filename, sep="|")
+qm9_df = qm9_df[["smiles"]]
+
+cep_df = pd.read_csv(cep_filename)
+cep_df = cep_df[["smiles"]]
+
+# Combine the dataframes into one large dataframe
+combined_df = pd.concat([zinc_df, qm9_df, cep_df], axis=0)
+
+# Save the combined dataframe to a Parquet file
+output_filename = "qm9_zinc250_cep.parquet"
+combined_df.to_parquet(output_filename, index=False)
+print(f"Combined dataframe saved to '{output_filename}' as Parquet file.")
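The script above holds the whole download in memory before hashing, which is fine for a 250k-row CSV. For larger files the same MD5 check can be done incrementally while streaming; a sketch of that alternative (not part of the commit):

import hashlib
import requests

def download_with_md5(url: str, filename: str, expected_md5: str) -> None:
    """Stream a download to disk while hashing it chunk by chunk."""
    md5 = hashlib.md5()
    with requests.get(url, stream=True) as resp, open(filename, "wb") as fh:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=1 << 20):
            md5.update(chunk)
            fh.write(chunk)
    # Note: the file is written before the check; remove it on mismatch if needed.
    if md5.hexdigest() != expected_md5:
        raise ValueError("MD5 checksum does not match")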
data/qm9_zinc250k_cep/qm9_zinc250_cep.parquet
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3003c48cff3793646f07692b85745786d4d9b103323b3b59b3ae5b23af071d3a
+size 7580076
data/vocab.txt
ADDED
@@ -0,0 +1,612 @@
+[PAD]
+[unused1]
+[unused2]
+[unused3]
+[unused4]
+[unused5]
+[unused6]
+[unused7]
+[unused8]
+[unused9]
+[unused10]
+[UNK]
+[CLS]
+[SEP]
+[MASK]
+c
+C
+(
+)
+O
+1
+2
+=
+N
+.
+n
+3
+F
+Cl
+>>
+~
+-
+4
+[C@H]
+S
+[C@@H]
+[O-]
+Br
+#
+/
+[nH]
+[N+]
+s
+5
+o
+P
+[Na+]
+[Si]
+I
+[Na]
+[Pd]
+[K+]
+[K]
+[P]
+B
+[C@]
+[C@@]
+[Cl-]
+6
+[OH-]
+\
+[N-]
+[Li]
+[H]
+[2H]
+[NH4+]
+[c-]
+[P-]
+[Cs+]
+[Li+]
+[Cs]
+[NaH]
+[H-]
+[O+]
+[BH4-]
+[Cu]
+7
+[Mg]
+[Fe+2]
+[n+]
+[Sn]
+[BH-]
+[Pd+2]
+[CH]
+[I-]
+[Br-]
+[C-]
+[Zn]
+[B-]
+[F-]
+[Al]
+[P+]
+[BH3-]
+[Fe]
+[C]
+[AlH4]
+[Ni]
+[SiH]
+8
+[Cu+2]
+[Mn]
+[AlH]
+[nH+]
+[AlH4-]
+[O-2]
+[Cr]
+[Mg+2]
+[NH3+]
+[S@]
+[Pt]
+[Al+3]
+[S@@]
+[S-]
+[Ti]
+[Zn+2]
+[PH]
+[NH2+]
+[Ru]
+[Ag+]
+[S+]
+[I+3]
+[NH+]
+[Ca+2]
+[Ag]
+9
+[Os]
+[Se]
+[SiH2]
+[Ca]
+[Ti+4]
+[Ac]
+[Cu+]
+[S]
+[Rh]
+[Cl+3]
+[cH-]
+[Zn+]
+[O]
+[Cl+]
+[SH]
+[H+]
+[Pd+]
+[se]
+[PH+]
+[I]
+[Pt+2]
+[C+]
+[Mg+]
+[Hg]
+[W]
+[SnH]
+[SiH3]
+[Fe+3]
+[NH]
+[Mo]
+[CH2+]
+%10
+[CH2-]
+[CH2]
+[n-]
+[Ce+4]
+[NH-]
+[Co]
+[I+]
+[PH2]
+[Pt+4]
+[Ce]
+[B]
+[Sn+2]
+[Ba+2]
+%11
+[Fe-3]
+[18F]
+[SH-]
+[Pb+2]
+[Os-2]
+[Zr+4]
+[N]
+[Ir]
+[Bi]
+[Ni+2]
+[P@]
+[Co+2]
+[s+]
+[As]
+[P+3]
+[Hg+2]
+[Yb+3]
+[CH-]
+[Zr+2]
+[Mn+2]
+[CH+]
+[In]
+[KH]
+[Ce+3]
+[Zr]
+[AlH2-]
+[OH2+]
+[Ti+3]
+[Rh+2]
+[Sb]
+[S-2]
+%12
+[P@@]
+[Si@H]
+[Mn+4]
+p
+[Ba]
+[NH2-]
+[Ge]
+[Pb+4]
+[Cr+3]
+[Au]
+[LiH]
+[Sc+3]
+[o+]
+[Rh-3]
+%13
+[Br]
+[Sb-]
+[S@+]
+[I+2]
+[Ar]
+[V]
+[Cu-]
+[Al-]
+[Te]
+[13c]
+[13C]
+[Cl]
+[PH4+]
+[SiH4]
+[te]
+[CH3-]
+[S@@+]
+[Rh+3]
+[SH+]
+[Bi+3]
+[Br+2]
+[La]
+[La+3]
+[Pt-2]
+[N@@]
+[PH3+]
+[N@]
+[Si+4]
+[Sr+2]
+[Al+]
+[Pb]
+[SeH]
+[Si-]
+[V+5]
+[Y+3]
+[Re]
+[Ru+]
+[Sm]
+*
+[3H]
+[NH2]
+[Ag-]
+[13CH3]
+[OH+]
+[Ru+3]
+[OH]
+[Gd+3]
+[13CH2]
+[In+3]
+[Si@@]
+[Si@]
+[Ti+2]
+[Sn+]
+[Cl+2]
+[AlH-]
+[Pd-2]
+[SnH3]
+[B+3]
+[Cu-2]
+[Nd+3]
+[Pb+3]
+[13cH]
+[Fe-4]
+[Ga]
+[Sn+4]
+[Hg+]
+[11CH3]
+[Hf]
+[Pr]
+[Y]
+[S+2]
+[Cd]
+[Cr+6]
+[Zr+3]
+[Rh+]
+[CH3]
+[N-3]
+[Hf+2]
+[Th]
+[Sb+3]
+%14
+[Cr+2]
+[Ru+2]
+[Hf+4]
+[14C]
+[Ta]
+[Tl+]
+[B+]
+[Os+4]
+[PdH2]
+[Pd-]
+[Cd+2]
+[Co+3]
+[S+4]
+[Nb+5]
+[123I]
+[c+]
+[Rb+]
+[V+2]
+[CH3+]
+[Ag+2]
+[cH+]
+[Mn+3]
+[Se-]
+[As-]
+[Eu+3]
+[SH2]
+[Sm+3]
+[IH+]
+%15
+[OH3+]
+[PH3]
+[IH2+]
+[SH2+]
+[Ir+3]
+[AlH3]
+[Sc]
+[Yb]
+[15NH2]
+[Lu]
+[sH+]
+[Gd]
+[18F-]
+[SH3+]
+[SnH4]
+[TeH]
+[Si@@H]
+[Ga+3]
+[CaH2]
+[Tl]
+[Ta+5]
+[GeH]
+[Br+]
+[Sr]
+[Tl+3]
+[Sm+2]
+[PH5]
+%16
+[N@@+]
+[Au+3]
+[C-4]
+[Nd]
+[Ti+]
+[IH]
+[N@+]
+[125I]
+[Eu]
+[Sn+3]
+[Nb]
+[Er+3]
+[123I-]
+[14c]
+%17
+[SnH2]
+[YH]
+[Sb+5]
+[Pr+3]
+[Ir+]
+[N+3]
+[AlH2]
+[19F]
+%18
+[Tb]
+[14CH]
+[Mo+4]
+[Si+]
+[BH]
+[Be]
+[Rb]
+[pH]
+%19
+%20
+[Xe]
+[Ir-]
+[Be+2]
+[C+4]
+[RuH2]
+[15NH]
+[U+2]
+[Au-]
+%21
+%22
+[Au+]
+[15n]
+[Al+2]
+[Tb+3]
+[15N]
+[V+3]
+[W+6]
+[14CH3]
+[Cr+4]
+[ClH+]
+b
+[Ti+6]
+[Nd+]
+[Zr+]
+[PH2+]
+[Fm]
+[N@H+]
+[RuH]
+[Dy+3]
+%23
+[Hf+3]
+[W+4]
+[11C]
+[13CH]
+[Er]
+[124I]
+[LaH]
+[F]
+[siH]
+[Ga+]
+[Cm]
+[GeH3]
+[IH-]
+[U+6]
+[SeH+]
+[32P]
+[SeH-]
+[Pt-]
+[Ir+2]
+[se+]
+[U]
+[F+]
+[BH2]
+[As+]
+[Cf]
+[ClH2+]
+[Ni+]
+[TeH3]
+[SbH2]
+[Ag+3]
+%24
+[18O]
+[PH4]
+[Os+2]
+[Na-]
+[Sb+2]
+[V+4]
+[Ho+3]
+[68Ga]
+[PH-]
+[Bi+2]
+[Ce+2]
+[Pd+3]
+[99Tc]
+[13C@@H]
+[Fe+6]
+[c]
+[GeH2]
+[10B]
+[Cu+3]
+[Mo+2]
+[Cr+]
+[Pd+4]
+[Dy]
+[AsH]
+[Ba+]
+[SeH2]
+[In+]
+[TeH2]
+[BrH+]
+[14cH]
+[W+]
+[13C@H]
+[AsH2]
+[In+2]
+[N+2]
+[N@@H+]
+[SbH]
+[60Co]
+[AsH4+]
+[AsH3]
+[18OH]
+[Ru-2]
+[Na-2]
+[CuH2]
+[31P]
+[Ti+5]
+[35S]
+[P@@H]
+[ArH]
+[Co+]
+[Zr-2]
+[BH2-]
+[131I]
+[SH5]
+[VH]
+[B+2]
+[Yb+2]
+[14C@H]
+[211At]
+[NH3+2]
+[IrH]
+[IrH2]
+[Rh-]
+[Cr-]
+[Sb+]
+[Ni+3]
+[TaH3]
+[Tl+2]
+[64Cu]
+[Tc]
+[Cd+]
+[1H]
+[15nH]
+[AlH2+]
+[FH+2]
+[BiH3]
+[Ru-]
+[Mo+6]
+[AsH+]
+[BaH2]
+[BaH]
+[Fe+4]
+[229Th]
+[Th+4]
+[As+3]
+[NH+3]
+[P@H]
+[Li-]
+[7NaH]
+[Bi+]
+[PtH+2]
+[p-]
+[Re+5]
+[NiH]
+[Ni-]
+[Xe+]
+[Ca+]
+[11c]
+[Rh+4]
+[AcH]
+[HeH]
+[Sc+2]
+[Mn+]
+[UH]
+[14CH2]
+[SiH4+]
+[18OH2]
+[Ac-]
+[Re+4]
+[118Sn]
+[153Sm]
+[P+2]
+[9CH]
+[9CH3]
+[Y-]
+[NiH2]
+[Si+2]
+[Mn+6]
+[ZrH2]
+[C-2]
+[Bi+5]
+[24NaH]
+[Fr]
+[15CH]
+[Se+]
+[At]
+[P-3]
+[124I-]
+[CuH2-]
+[Nb+4]
+[Nb+3]
+[MgH]
+[Ir+4]
+[67Ga+3]
+[67Ga]
+[13N]
+[15OH2]
+[2NH]
+[Ho]
+[Cn]
+[0*]
+[1*]
+[2*]
+[3*]
+[4*]
+[5*]
+[6*]
+[7*]
+[8*]
+[9*]
+[10*]
+[11*]
+[12*]
+[13*]
+[14*]
+[15*]
+[16*]
+[17*]
+[18*]
+[19*]
+[20*]
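The vocabulary reads like a BERT-style wordpiece list: reserved specials ([PAD], the [unusedN] slots, [UNK], [CLS], [SEP], [MASK]) first, then SMILES atom and bond tokens, ring-closure tokens (%10 through %24), isotope-labelled atoms, and the numbered wildcard atoms [0*] through [20*]. A sketch of loading it into token/id maps (file path as in this commit):

with open("data/vocab.txt") as fh:
    tokens = [line.rstrip("\n") for line in fh]

token_to_id = {tok: i for i, tok in enumerate(tokens)}
id_to_token = dict(enumerate(tokens))

assert token_to_id["[PAD]"] == 0  # specials sit at fixed leading indices
assert len(tokens) == 612         # matches the +1,612 hunk header above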
data/zinc/convert_to_parquet.py
ADDED
@@ -0,0 +1,67 @@
+import pandas as pd
+import os.path as osp
+import os
+from tqdm import tqdm
+import dask.dataframe as dd
+import pandas as pd
+import pyarrow as pa
+import shutil
+cwd = osp.abspath(osp.dirname(__file__))
+zinc_path = os.path.join(cwd, "zinc_complete")
+alls_dirs = [
+    osp.join(zinc_path, f)
+    for f in os.listdir(zinc_path)
+    if osp.isdir(osp.join(zinc_path, f))
+]
+
+print("Number of dirs: ", len(alls_dirs))
+all_dfs = []
+for d in alls_dirs:
+    print(f"Read: {d}")
+    df = dd.read_csv(
+        os.path.join(cwd, "zinc_complete", f"{d}/*.txt"),
+        sep="\t",
+        usecols=["smiles"],
+    )
+    all_dfs.append(df)
+
+concatenated_df = dd.concat(all_dfs)
+# res = df["logp"].map_partitions(lambda d, bins: pd.cut(d, bins), 25).compute()
+# print(res)
+
+print("Writing")
+# print(df)
+# name_function = lambda x: f"zincfull-{x}.parquet"
+concatenated_df = concatenated_df.repartition(npartitions=1)
+concatenated_df = concatenated_df.reset_index(drop=True)
+concatenated_df.to_parquet(
+    os.path.join(cwd, "zinc_processed"),
+)
+print("Done Writing")
+print(len(concatenated_df))
+shutil.copy(
+    os.path.join(cwd, "zinc_processed", "part.0.parquet"),
+    os.path.join(cwd, "zinc_processed.parquet")
+)
+
+# df = None
+# for d in tqdm(alls_dirs):
+#     if df is not None:
+#         print(len(df))
+#     files = [osp.join(d, f) for f in os.listdir(d)]
+#     for f in files:
+#         try:
+#             df_extra = pd.read_csv(f, sep="\t")
+#         except Exception as e:
+#             print(f"Got error {f}: {e}")
+#             continue
+#         # print(df)
+#         if df is None:
+#             df = df_extra
+#         else:
+#             df = df.append(df_extra)
+
+# df.to_parquet(osp.join(cwd, "zinc_combined.parquet"))
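`repartition(npartitions=1)` forces dask to write a single `part.0.parquet`, which the script then copies up one level as `zinc_processed.parquet` (the file combine_all.py samples from). A sketch of checking the result, assuming the script has already been run:

import pandas as pd

df = pd.read_parquet("data/zinc/zinc_processed.parquet")
print(len(df), df.columns.tolist())  # expect a single "smiles" column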
data/zinc/zinc_complete/download_zinc.sh
ADDED
@@ -0,0 +1,300 @@
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AAAA.txt -O AA/AAAA.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AAAB.txt -O AA/AAAB.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AAAC.txt -O AA/AAAC.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AAAD.txt -O AA/AAAD.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AABA.txt -O AA/AABA.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AABB.txt -O AA/AABB.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AABC.txt -O AA/AABC.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AABD.txt -O AA/AABD.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AACA.txt -O AA/AACA.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AACB.txt -O AA/AACB.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AACC.txt -O AA/AACC.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AACD.txt -O AA/AACD.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AAEA.txt -O AA/AAEA.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AAEB.txt -O AA/AAEB.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AAEC.txt -O AA/AAEC.txt
+mkdir -pv AA && wget http://files.docking.org/2D/AA/AAED.txt -O AA/AAED.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BAAA.txt -O BA/BAAA.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BAAB.txt -O BA/BAAB.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BAAC.txt -O BA/BAAC.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BAAD.txt -O BA/BAAD.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BABA.txt -O BA/BABA.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BABB.txt -O BA/BABB.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BABC.txt -O BA/BABC.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BABD.txt -O BA/BABD.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BACA.txt -O BA/BACA.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BACB.txt -O BA/BACB.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BACC.txt -O BA/BACC.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BACD.txt -O BA/BACD.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BAEA.txt -O BA/BAEA.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BAEB.txt -O BA/BAEB.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BAEC.txt -O BA/BAEC.txt
+mkdir -pv BA && wget http://files.docking.org/2D/BA/BAED.txt -O BA/BAED.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CAAA.txt -O CA/CAAA.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CAAB.txt -O CA/CAAB.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CAAC.txt -O CA/CAAC.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CAAD.txt -O CA/CAAD.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CABA.txt -O CA/CABA.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CABB.txt -O CA/CABB.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CABC.txt -O CA/CABC.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CABD.txt -O CA/CABD.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CACA.txt -O CA/CACA.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CACB.txt -O CA/CACB.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CACC.txt -O CA/CACC.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CACD.txt -O CA/CACD.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CAEA.txt -O CA/CAEA.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CAEB.txt -O CA/CAEB.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CAEC.txt -O CA/CAEC.txt
+mkdir -pv CA && wget http://files.docking.org/2D/CA/CAED.txt -O CA/CAED.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DAAA.txt -O DA/DAAA.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DAAB.txt -O DA/DAAB.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DAAC.txt -O DA/DAAC.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DAAD.txt -O DA/DAAD.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DABA.txt -O DA/DABA.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DABB.txt -O DA/DABB.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DABC.txt -O DA/DABC.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DABD.txt -O DA/DABD.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DACA.txt -O DA/DACA.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DACB.txt -O DA/DACB.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DACC.txt -O DA/DACC.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DACD.txt -O DA/DACD.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DAEA.txt -O DA/DAEA.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DAEB.txt -O DA/DAEB.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DAEC.txt -O DA/DAEC.txt
+mkdir -pv DA && wget http://files.docking.org/2D/DA/DAED.txt -O DA/DAED.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EAAA.txt -O EA/EAAA.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EAAB.txt -O EA/EAAB.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EAAC.txt -O EA/EAAC.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EAAD.txt -O EA/EAAD.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EABA.txt -O EA/EABA.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EABB.txt -O EA/EABB.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EABC.txt -O EA/EABC.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EABD.txt -O EA/EABD.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EACA.txt -O EA/EACA.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EACB.txt -O EA/EACB.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EACC.txt -O EA/EACC.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EACD.txt -O EA/EACD.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EAEA.txt -O EA/EAEA.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EAEB.txt -O EA/EAEB.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EAEC.txt -O EA/EAEC.txt
+mkdir -pv EA && wget http://files.docking.org/2D/EA/EAED.txt -O EA/EAED.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FAAA.txt -O FA/FAAA.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FAAB.txt -O FA/FAAB.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FAAC.txt -O FA/FAAC.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FAAD.txt -O FA/FAAD.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FABA.txt -O FA/FABA.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FABB.txt -O FA/FABB.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FABC.txt -O FA/FABC.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FABD.txt -O FA/FABD.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FACA.txt -O FA/FACA.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FACB.txt -O FA/FACB.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FACC.txt -O FA/FACC.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FACD.txt -O FA/FACD.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FAEA.txt -O FA/FAEA.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FAEB.txt -O FA/FAEB.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FAEC.txt -O FA/FAEC.txt
+mkdir -pv FA && wget http://files.docking.org/2D/FA/FAED.txt -O FA/FAED.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GAAA.txt -O GA/GAAA.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GAAB.txt -O GA/GAAB.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GAAC.txt -O GA/GAAC.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GAAD.txt -O GA/GAAD.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABAA.txt -O AB/ABAA.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABAB.txt -O AB/ABAB.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABAC.txt -O AB/ABAC.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABAD.txt -O AB/ABAD.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABBA.txt -O AB/ABBA.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABBB.txt -O AB/ABBB.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABBC.txt -O AB/ABBC.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABBD.txt -O AB/ABBD.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABCA.txt -O AB/ABCA.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABCB.txt -O AB/ABCB.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABCC.txt -O AB/ABCC.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABCD.txt -O AB/ABCD.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABEA.txt -O AB/ABEA.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABEB.txt -O AB/ABEB.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABEC.txt -O AB/ABEC.txt
+mkdir -pv AB && wget http://files.docking.org/2D/AB/ABED.txt -O AB/ABED.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBAA.txt -O BB/BBAA.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBAB.txt -O BB/BBAB.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBAC.txt -O BB/BBAC.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBAD.txt -O BB/BBAD.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBBA.txt -O BB/BBBA.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBBB.txt -O BB/BBBB.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBBC.txt -O BB/BBBC.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBBD.txt -O BB/BBBD.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GABA.txt -O GA/GABA.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GABB.txt -O GA/GABB.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GABC.txt -O GA/GABC.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GABD.txt -O GA/GABD.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GACA.txt -O GA/GACA.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GACB.txt -O GA/GACB.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GACC.txt -O GA/GACC.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GACD.txt -O GA/GACD.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GAEA.txt -O GA/GAEA.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GAEB.txt -O GA/GAEB.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GAEC.txt -O GA/GAEC.txt
+mkdir -pv GA && wget http://files.docking.org/2D/GA/GAED.txt -O GA/GAED.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HAAA.txt -O HA/HAAA.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HAAB.txt -O HA/HAAB.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HAAC.txt -O HA/HAAC.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HAAD.txt -O HA/HAAD.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HABA.txt -O HA/HABA.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HABB.txt -O HA/HABB.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HABC.txt -O HA/HABC.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HABD.txt -O HA/HABD.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HACA.txt -O HA/HACA.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HACB.txt -O HA/HACB.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HACC.txt -O HA/HACC.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HACD.txt -O HA/HACD.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HAEA.txt -O HA/HAEA.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HAEB.txt -O HA/HAEB.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HAEC.txt -O HA/HAEC.txt
+mkdir -pv HA && wget http://files.docking.org/2D/HA/HAED.txt -O HA/HAED.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IAAA.txt -O IA/IAAA.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IAAB.txt -O IA/IAAB.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IAAC.txt -O IA/IAAC.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IAAD.txt -O IA/IAAD.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IABA.txt -O IA/IABA.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IABB.txt -O IA/IABB.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IABC.txt -O IA/IABC.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IABD.txt -O IA/IABD.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IACA.txt -O IA/IACA.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IACB.txt -O IA/IACB.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IACC.txt -O IA/IACC.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IACD.txt -O IA/IACD.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IAEA.txt -O IA/IAEA.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IAEB.txt -O IA/IAEB.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IAEC.txt -O IA/IAEC.txt
+mkdir -pv IA && wget http://files.docking.org/2D/IA/IAED.txt -O IA/IAED.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JAAA.txt -O JA/JAAA.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JAAB.txt -O JA/JAAB.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JAAC.txt -O JA/JAAC.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JAAD.txt -O JA/JAAD.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JABA.txt -O JA/JABA.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JABB.txt -O JA/JABB.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JABC.txt -O JA/JABC.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JABD.txt -O JA/JABD.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JACA.txt -O JA/JACA.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JACB.txt -O JA/JACB.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JACC.txt -O JA/JACC.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JACD.txt -O JA/JACD.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JAEA.txt -O JA/JAEA.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JAEB.txt -O JA/JAEB.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JAEC.txt -O JA/JAEC.txt
+mkdir -pv JA && wget http://files.docking.org/2D/JA/JAED.txt -O JA/JAED.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KAAA.txt -O KA/KAAA.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KAAB.txt -O KA/KAAB.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KAAC.txt -O KA/KAAC.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KAAD.txt -O KA/KAAD.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KABA.txt -O KA/KABA.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KABB.txt -O KA/KABB.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KABC.txt -O KA/KABC.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KABD.txt -O KA/KABD.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KACA.txt -O KA/KACA.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KACB.txt -O KA/KACB.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KACC.txt -O KA/KACC.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KACD.txt -O KA/KACD.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KAEA.txt -O KA/KAEA.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KAEB.txt -O KA/KAEB.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KAEC.txt -O KA/KAEC.txt
+mkdir -pv KA && wget http://files.docking.org/2D/KA/KAED.txt -O KA/KAED.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBCA.txt -O BB/BBCA.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBCB.txt -O BB/BBCB.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBCC.txt -O BB/BBCC.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBCD.txt -O BB/BBCD.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBEA.txt -O BB/BBEA.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBEB.txt -O BB/BBEB.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBEC.txt -O BB/BBEC.txt
+mkdir -pv BB && wget http://files.docking.org/2D/BB/BBED.txt -O BB/BBED.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBAA.txt -O CB/CBAA.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBAB.txt -O CB/CBAB.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBAC.txt -O CB/CBAC.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBAD.txt -O CB/CBAD.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBBA.txt -O CB/CBBA.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBBB.txt -O CB/CBBB.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBBC.txt -O CB/CBBC.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBBD.txt -O CB/CBBD.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBCA.txt -O CB/CBCA.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBCB.txt -O CB/CBCB.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBCC.txt -O CB/CBCC.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBCD.txt -O CB/CBCD.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBEA.txt -O CB/CBEA.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBEB.txt -O CB/CBEB.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBEC.txt -O CB/CBEC.txt
+mkdir -pv CB && wget http://files.docking.org/2D/CB/CBED.txt -O CB/CBED.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBAA.txt -O DB/DBAA.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBAB.txt -O DB/DBAB.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBAC.txt -O DB/DBAC.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBAD.txt -O DB/DBAD.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBBA.txt -O DB/DBBA.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBBB.txt -O DB/DBBB.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBBC.txt -O DB/DBBC.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBBD.txt -O DB/DBBD.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBCA.txt -O DB/DBCA.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBCB.txt -O DB/DBCB.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBCC.txt -O DB/DBCC.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBCD.txt -O DB/DBCD.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBEA.txt -O DB/DBEA.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBEB.txt -O DB/DBEB.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBEC.txt -O DB/DBEC.txt
+mkdir -pv DB && wget http://files.docking.org/2D/DB/DBED.txt -O DB/DBED.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBAA.txt -O EB/EBAA.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBAB.txt -O EB/EBAB.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBAC.txt -O EB/EBAC.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBAD.txt -O EB/EBAD.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBBA.txt -O EB/EBBA.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBBB.txt -O EB/EBBB.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBBC.txt -O EB/EBBC.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBBD.txt -O EB/EBBD.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBCA.txt -O EB/EBCA.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBCB.txt -O EB/EBCB.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBCC.txt -O EB/EBCC.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBCD.txt -O EB/EBCD.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBEA.txt -O EB/EBEA.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBEB.txt -O EB/EBEB.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBEC.txt -O EB/EBEC.txt
+mkdir -pv EB && wget http://files.docking.org/2D/EB/EBED.txt -O EB/EBED.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBAA.txt -O FB/FBAA.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBAB.txt -O FB/FBAB.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBAC.txt -O FB/FBAC.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBAD.txt -O FB/FBAD.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBBA.txt -O FB/FBBA.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBBB.txt -O FB/FBBB.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBBC.txt -O FB/FBBC.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBBD.txt -O FB/FBBD.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBCA.txt -O FB/FBCA.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBCB.txt -O FB/FBCB.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBCC.txt -O FB/FBCC.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBCD.txt -O FB/FBCD.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBEA.txt -O FB/FBEA.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBEB.txt -O FB/FBEB.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBEC.txt -O FB/FBEC.txt
+mkdir -pv FB && wget http://files.docking.org/2D/FB/FBED.txt -O FB/FBED.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBAA.txt -O GB/GBAA.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBAB.txt -O GB/GBAB.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBAC.txt -O GB/GBAC.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBAD.txt -O GB/GBAD.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBBA.txt -O GB/GBBA.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBBB.txt -O GB/GBBB.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBBC.txt -O GB/GBBC.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBBD.txt -O GB/GBBD.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBCA.txt -O GB/GBCA.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBCB.txt -O GB/GBCB.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBCC.txt -O GB/GBCC.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBCD.txt -O GB/GBCD.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBEA.txt -O GB/GBEA.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBEB.txt -O GB/GBEB.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBEC.txt -O GB/GBEC.txt
+mkdir -pv GB && wget http://files.docking.org/2D/GB/GBED.txt -O GB/GBED.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBAA.txt -O HB/HBAA.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBAB.txt -O HB/HBAB.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBAC.txt -O HB/HBAC.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBAD.txt -O HB/HBAD.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBBA.txt -O HB/HBBA.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBBB.txt -O HB/HBBB.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBBC.txt -O HB/HBBC.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBBD.txt -O HB/HBBD.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBCA.txt -O HB/HBCA.txt
+mkdir -pv HB && wget http://files.docking.org/2D/HB/HBCB.txt -O HB/HBCB.txt
|
299 |
+
mkdir -pv HB && wget http://files.docking.org/2D/HB/HBCC.txt -O HB/HBCC.txt
|
300 |
+
mkdir -pv HB && wget http://files.docking.org/2D/HB/HBCD.txt -O HB/HBCD.txt
|
data/zinc/zinc_complete/run_download.py
ADDED
@@ -0,0 +1,21 @@
+import concurrent.futures
+import subprocess
+
+shell_file = "download_zinc.sh"
+num_parallel = 8
+
+def execute_command(command):
+    print("Running: ", command)
+    subprocess.run(command, shell=True)
+
+commands = []
+with open(shell_file, "r") as file:
+    for line in file:
+        line = line.strip()
+        if line.startswith("mkdir") and "wget" in line:
+            commands.append(line)
+
+with concurrent.futures.ThreadPoolExecutor() as executor:
+    executor.map(execute_command, commands, chunksize=num_parallel)
+
+print("Downloads completed")
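A note on the executor above: `ThreadPoolExecutor.map` ignores the `chunksize` argument (per the `concurrent.futures` docs it only affects `ProcessPoolExecutor`), so `num_parallel` does not actually bound the number of concurrent downloads here. A minimal sketch of one way to cap concurrency, assuming the same `download_zinc.sh` layout; this is an illustration, not part of the commit:

```python
import concurrent.futures
import subprocess

shell_file = "download_zinc.sh"
num_parallel = 8  # intended cap on concurrent wget processes

def execute_command(command: str) -> int:
    # Run one "mkdir ... && wget ..." line and report its exit code.
    return subprocess.run(command, shell=True).returncode

with open(shell_file) as f:
    commands = [ln.strip() for ln in f
                if ln.strip().startswith("mkdir") and "wget" in ln]

# max_workers is what actually bounds concurrency for a ThreadPoolExecutor.
with concurrent.futures.ThreadPoolExecutor(max_workers=num_parallel) as executor:
    exit_codes = list(executor.map(execute_command, commands))

failed = sum(1 for code in exit_codes if code != 0)
print(f"Downloads completed ({failed} of {len(commands)} failed)")
```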
demonstrator.ipynb
ADDED
@@ -0,0 +1,521 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Demonstrator"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load the model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:sample:Compiling the model...\n"
+     ]
+    }
+   ],
+   "source": [
+    "import rdkit\n",
+    "from rdkit import Chem\n",
+    "import rdkit.rdBase as rkrb\n",
+    "import rdkit.RDLogger as rkl\n",
+    "import os\n",
+    "import torch \n",
+    "import logging\n",
+    "import numpy as np\n",
+    "from plot_utils import check_metrics\n",
+    "from sample import Sampler\n",
+    "import pandas as pd\n",
+    "\n",
+    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
+    "\n",
+    "if \"cuda\" in device:\n",
+    "    # dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'\n",
+    "    dtype = \"float16\" if torch.cuda.is_available() else \"float32\"\n",
+    "else:\n",
+    "    dtype = \"float32\"\n",
+    "\n",
+    "logger = rkl.logger()\n",
+    "logger.setLevel(rkl.ERROR)\n",
+    "rkrb.DisableLog(\"rdApp.error\")\n",
+    "\n",
+    "torch.set_num_threads(8)\n",
+    "logging.basicConfig(level=logging.INFO)\n",
+    "logger = logging.getLogger(__name__)\n",
+    "\n",
+    "sampler = Sampler(\n",
+    "    load_path=os.path.join(\n",
+    "        os.getcwd(), \"out\", \"llama2-M-Full-RSS.pt\"\n",
+    "    ),\n",
+    "    device=device,\n",
+    "    seed=1234,\n",
+    "    dtype=dtype,\n",
+    "    compile=True,\n",
+    ")\n",
+    "\n",
+    " \n",
+    "num_samples = 100\n",
+    "df_comp = pd.read_parquet(os.path.join(os.getcwd(),\"data\",\"OrganiX13.parquet\"))\n",
+    "df_comp = df_comp.sample(n=2_500_000)\n",
+    "comp_context_dict = {c: df_comp[c].to_numpy() for c in [\"logp\", \"sascore\", \"mol_weight\"]} \n",
+    "comp_smiles = df_comp[\"smiles\"]\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:root:Wrote file /home/ndobberstein/Projekte/llama2-molgen/chemiscope_gen.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "from typing import List, Dict\n",
+    "import json\n",
+    "from rdkit.Chem import AllChem\n",
+    "\n",
+    "@torch.no_grad()\n",
+    "def convert_to_chemiscope(smiles_list : List[str], context_dict : Dict[str, List[float]]):\n",
+    "    # For more details on the file format: https://chemiscope.org/docs/tutorial/input-reference.html\n",
+    "\n",
+    "    structures = []\n",
+    "    remove_list = []\n",
+    "    for i,smi in enumerate(smiles_list):\n",
+    "        mol = Chem.MolFromSmiles(smi)\n",
+    "        if mol is None:\n",
+    "            logging.info(f\"Mol invalid: {smi} ! Skipping...\")\n",
+    "            remove_list.append(i)\n",
+    "            continue\n",
+    "\n",
+    "        res = AllChem.EmbedMolecule(mol,randomSeed=0xf00d, maxAttempts=20)\n",
+    "        # res = AllChem.Compute2DCoords(mol)\n",
+    "\n",
+    "        if res != 0:\n",
+    "            logging.info(f\"Could not calculate coordinates for {smi}! Skipping..\")\n",
+    "            remove_list.append(i)\n",
+    "            continue\n",
+    "        \n",
+    "\n",
+    "        conf = list(mol.GetConformers())[0]\n",
+    "        x,y,z = [],[],[]\n",
+    "        symbols = []\n",
+    "        for atom, coords in zip(mol.GetAtoms(), conf.GetPositions()):\n",
+    "            symbols.append(atom.GetSymbol())\n",
+    "            x.append(coords[0])\n",
+    "            y.append(coords[1])\n",
+    "            z.append(coords[2])\n",
+    "        \n",
+    "        structures.append({\n",
+    "            \"size\": len(x),\n",
+    "            \"names\": symbols,\n",
+    "            \"x\": x,\n",
+    "            \"y\": y,\n",
+    "            \"z\" : z\n",
+    "        })\n",
+    "\n",
+    "\n",
+    "\n",
+    "    properties = {}\n",
+    "    \n",
+    "    for c in context_dict:\n",
+    "        properties[c] = {\n",
+    "            \"target\": \"structure\",\n",
+    "            \"values\": [v for i, v in enumerate(context_dict[c]) if i not in remove_list]\n",
+    "        }\n",
+    "    \n",
+    "\n",
+    "\n",
+    "    \n",
+    "    data = {\n",
+    "        \"meta\": {\n",
+    "            # // the name of the dataset\n",
+    "            \"name\": \"Test Dataset\",\n",
+    "            # // description of the dataset, OPTIONAL\n",
+    "            \"description\": \"This contains data from generated molecules\",\n",
+    "            # // authors of the dataset, OPTIONAL\n",
+    "            \"authors\": [\"Niklas Dobberstein, [email protected]\"],\n",
+    "            # // references for the dataset, OPTIONAL\n",
+    "            \"references\": [\n",
+    "                \"\",\n",
+    "            ],\n",
+    "            \n",
+    "        },\n",
+    "        \"properties\": properties,\n",
+    "        \"structures\": structures\n",
+    "    }\n",
+    "    \n",
+    "    out_path = os.path.join(os.getcwd(), \"chemiscope_gen.json\")\n",
+    "    with open(out_path, \"w\") as f:\n",
+    "        json.dump(data, f)\n",
+    "\n",
+    "    logging.info(f\"Wrote file {out_path}\")\n",
+    "\n",
+    "convert_to_chemiscope([\n",
+    "    \"CC=O\",\n",
+    "    \"s1ccnc1\"\n",
+    "], {\"logp\": [1.0,2.0], \"sascore\": [1.5,-2.0]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8b28a4e692de4bb48fde10a88d9727ba",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(Checkbox(value=False, description='logp'), Checkbox(value=False, description='sascore'), Checkb…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "62331a62f2bf4d08a3a202ad277c6d92",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "HBox(children=(FloatSlider(value=0.0, description='logp:', max=7.0, min=-4.0, step=0.5), FloatSlider(value=2.0…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2d498af39f4046b0a5bb92080361dfec",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Text(value='', description='Context SMI:')"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ed8a755253444e9c83dc27c5f830588b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "FloatSlider(value=0.8, description='Temperature:', max=2.0)"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "139e7d1e40984101800e2cbb740280b0",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Button(description='Generate', style=ButtonStyle())"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4d119a3b477243ac916478a6ec2a55c7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "dfce28d4f6a3414c838e6542ffb43fc6",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import ipywidgets as widgets\n",
+    "from IPython.display import display, clear_output, HTML\n",
+    "import numpy as np\n",
+    "import torch\n",
+    "import matplotlib.pyplot as plt\n",
+    "from rdkit import Chem\n",
+    "from rdkit.Chem import Draw\n",
+    "import logging\n",
+    "from plot_utils import calc_context_from_smiles\n",
+    "\n",
+    "# Define the context_cols options and create checkboxes for them\n",
+    "context_cols_options = [\"logp\", \"sascore\", \"mol_weight\"]\n",
+    "context_cols_checkboxes = [widgets.Checkbox(description=col, value=False) for col in context_cols_options]\n",
+    "\n",
+    "# Create a text input for context_smi\n",
+    "context_smi_input = widgets.Text(description=\"Context SMI:\", value=\"\")\n",
+    "\n",
+    "# Create sliders for temperature and context_cols values\n",
+    "temperature_slider = widgets.FloatSlider(description=\"Temperature:\", min=0, max=2.0, step=0.1, value=0.8)\n",
+    "\n",
+    "logp_slider = widgets.FloatSlider(description=\"logp:\", min=-4, max=7, step=0.5, value=0.0)\n",
+    "sascore_slider = widgets.FloatSlider(description=\"sascore:\", min=1, max=10, step=0.5, value=2.0)\n",
+    "mol_weight_slider = widgets.FloatSlider(description=\"mol_weight:\", min=0.5, max=10, step=0.5, value=3.0)\n",
+    "\n",
+    "# Create a button to generate the code and display SMILES\n",
+    "generate_button = widgets.Button(description=\"Generate\")\n",
+    "\n",
+    "# Create an output widget for displaying generated information\n",
+    "output = widgets.Output()\n",
+    "\n",
+    "# Create an output widget for displaying the RDKit molecules\n",
+    "molecule_output = widgets.Output()\n",
+    "\n",
+    "@torch.no_grad()\n",
+    "def generate_code(_):\n",
+    "    with output:\n",
+    "        clear_output(wait=False)\n",
+    "        # logging.info(\"Parameters used in generation:\")\n",
+    "        \n",
+    "        # Get the selected context_cols\n",
+    "        selected_context_cols = [col for col, checkbox in zip(context_cols_options, context_cols_checkboxes) if checkbox.value]\n",
+    "        # logging.info(f\"Context Cols: {selected_context_cols}\")\n",
+    "        \n",
+    "        # Get the values of context_smi and temperature from the sliders\n",
+    "        context_smi = context_smi_input.value.strip()\n",
+    "        temperature = temperature_slider.value\n",
+    "        # logging.info(f\"Context Smiles: {context_smi}\")\n",
+    "        # logging.info(f\"Temperature: {temperature}\")\n",
+    "        \n",
+    "        # Get the values of logp, sascore, and mol_weight from the sliders\n",
+    "        context_dict = {} if len(selected_context_cols) != 0 else None\n",
+    "        for c in selected_context_cols:\n",
+    "            if c == \"logp\":\n",
+    "                val = logp_slider.value\n",
+    "            elif c == \"sascore\":\n",
+    "                val = sascore_slider.value\n",
+    "            else:\n",
+    "                val = mol_weight_slider.value\n",
+    "            val = round(val, 2)\n",
+    "            context_dict[c] = val*torch.ones((num_samples,),device=device,dtype=torch.float)\n",
+    "            # logging.info(f\"{c}: {val}\")\n",
+    "        \n",
+    "        # Generate SMILES using the provided context\n",
+    "        smiles, context = sampler.generate(\n",
+    "            context_cols=context_dict,\n",
+    "            context_smi=context_smi,\n",
+    "            start_smiles=None,\n",
+    "            num_samples=num_samples,\n",
+    "            max_new_tokens=256,\n",
+    "            temperature=temperature,\n",
+    "            top_k=25,\n",
+    "            total_gen_steps=int(np.ceil(num_samples / 1000)),\n",
+    "            return_context=True\n",
+    "        )\n",
+    "        \n",
+    "        with open(os.path.join(os.getcwd(), \"gen_smiles.txt\"), \"w\") as f:\n",
+    "            for s in smiles:\n",
+    "                f.write(f\"{s}\\n\")\n",
+    "        # Display SMILES as RDKit molecules\n",
+    "        display_molecules(smiles, context)\n",
+    "\n",
+    "\n",
+    "\n",
+    "def display_molecules(smiles_list, context_dict):\n",
+    "    with molecule_output:\n",
+    "        clear_output(wait=False)\n",
+    "        molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]\n",
+    "        \n",
+    "        # Convert RDKit molecules to images and store them in a list\n",
+    "        images = [Draw.MolToImage(mol) for mol in molecules]\n",
+    "        \n",
+    "        # Create a subplot grid to display the images\n",
+    "        num_images = len(images)\n",
+    "        num_cols = 5  # Number of columns in the grid\n",
+    "        num_rows = (num_images + num_cols - 1) // num_cols  # Calculate the number of rows\n",
+    "        \n",
+    "        fig, axes = plt.subplots(num_rows, num_cols, figsize=(25, 25))\n",
+    "        fig.subplots_adjust(hspace=0.5)\n",
+    "        calculated_context = {c:[] for c in context_dict}\n",
+    "        for i, ax in enumerate(axes.flat):\n",
+    "            if i < num_images:\n",
+    "                ax.imshow(images[i])\n",
+    "                for j, c in enumerate(context_dict):\n",
+    "                    smiles = smiles_list[i]\n",
+    "                    smi_con = round(calc_context_from_smiles([smiles], c)[0],2)\n",
+    "                    calculated_context[c].append(smi_con)\n",
+    "                    ax.text(0.5, -0.1 * j , f\"{c}: {context_dict[c][i]} vs {smi_con}\", transform=ax.transAxes, fontsize=10, ha='center')\n",
+    "                \n",
+    "                ax.axis('off')\n",
+    "            else:\n",
+    "                fig.delaxes(ax)  # Remove empty subplots if there are more rows than images\n",
+    "        \n",
+    "\n",
+    "        if len(context_dict) >= 2:\n",
+    "            convert_to_chemiscope(smiles_list, calculated_context)\n",
+    "\n",
+    "        plt.savefig(\"gen_mols.png\")\n",
+    "        plt.show()\n",
+    "\n",
+    "# Attach the generate_code function to the button's click event\n",
+    "generate_button.on_click(generate_code)\n",
+    "\n",
+    "# Display the widgets\n",
+    "display(widgets.HBox(context_cols_checkboxes))\n",
+    "display(widgets.HBox((logp_slider, sascore_slider, mol_weight_slider)))\n",
+    "\n",
+    "display(context_smi_input)\n",
+    "display(temperature_slider)\n",
+    "display(generate_button)\n",
+    "display(output)\n",
+    "display(molecule_output)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ea96e00e0ea8448d97906ec965f04788",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batch: 0%| | 0/1 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "77ba2d72172846e18572c94bc5b3bd6f",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Generation: 0%| | 0/256 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:sample:Number valid generated: 68.0 %\n",
+      "INFO:sample:---------------\n"
+     ]
+    }
+   ],
+   "source": [
+    "selected_context_cols = [\"logp\", \"sascore\", \"mol_weight\"]\n",
+    "num_samples = 25\n",
+    "context_dict = {} if len(selected_context_cols) != 0 else None\n",
+    "for c in selected_context_cols:\n",
+    "    if c == \"logp\":\n",
+    "        v = 0.5 * torch.randint(\n",
+    "            -8, 14, (num_samples,), device=device, dtype=torch.float\n",
+    "        )\n",
+    "        context_dict[c] = v.sort()[0]\n",
+    "    elif c == \"sascore\":\n",
+    "        v = 0.5 * torch.randint(\n",
+    "            1, 20, (num_samples,), device=device, dtype=torch.float\n",
+    "        )\n",
+    "        context_dict[c] = v.sort()[0]\n",
+    "    else:\n",
+    "        v = 0.5 * torch.randint(\n",
+    "            1, 20, (num_samples,), device=device, dtype=torch.float\n",
+    "        )\n",
+    "        \n",
+    "        context_dict[c] = v.sort()[0]\n",
+    "    # logging.info(f\"{c}: {val}\")\n",
+    "\n",
+    "# Generate SMILES using the provided context\n",
+    "smiles, context = sampler.generate(\n",
+    "    context_cols=context_dict,\n",
+    "    context_smi=None,\n",
+    "    start_smiles=None,\n",
+    "    num_samples=num_samples,\n",
+    "    max_new_tokens=256,\n",
+    "    temperature=0.8,\n",
+    "    top_k=25,\n",
+    "    total_gen_steps=int(np.ceil(num_samples / 1000)),\n",
+    "    return_context=True\n",
+    ")\n",
+    "\n",
+    "# Display SMILES as RDKit molecules\n",
+    "display_molecules(smiles, context)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "torch2-bachelor",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.18"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
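The notebook above drives generation through ipywidgets. For reference, a sketch of the same workflow run headless; every argument name (`Sampler`, `generate`, `context_cols`, `context_smi`, `total_gen_steps`, `return_context`) is taken from the notebook cells above, while the property values and `compile=False` (to skip `torch.compile` for a quick one-off run) are illustrative choices:

```python
import os
import numpy as np
import torch
from sample import Sampler

device = "cuda" if torch.cuda.is_available() else "cpu"
sampler = Sampler(
    load_path=os.path.join(os.getcwd(), "out", "llama2-M-Full-RSS.pt"),
    device=device,
    seed=1234,
    dtype="float16" if device == "cuda" else "float32",
    compile=False,
)

num_samples = 50
# Condition every sample on the same target property values (assumed targets).
context = {
    "logp": 2.0 * torch.ones((num_samples,), device=device, dtype=torch.float),
    "sascore": 2.5 * torch.ones((num_samples,), device=device, dtype=torch.float),
}
smiles, context_used = sampler.generate(
    context_cols=context,
    context_smi=None,       # optionally a SMILES fragment to condition on
    start_smiles=None,
    num_samples=num_samples,
    max_new_tokens=256,
    temperature=0.8,
    top_k=25,
    total_gen_steps=int(np.ceil(num_samples / 1000)),
    return_context=True,
)
print("\n".join(smiles))
```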
fragment_creator.py
ADDED
@@ -0,0 +1,136 @@
+from abc import ABC
+from dataclasses import dataclass
+from typing import List, Union
+import numpy as np
+from rdkit import Chem
+from rdkit.Chem.BRICS import BRICSDecompose
+from rdkit.Chem.Recap import RecapDecompose
+
+import random
+
+
+@dataclass
+class Fragment:
+    smiles: Union[str, None]
+    tokens: Union[List[int], None]
+
+
+class BaseFragmentCreator(ABC):
+    """
+    Base class for all fragment creators; does nothing to the smiles
+    """
+
+    def __init__(self) -> None:
+        pass
+
+    def create_fragment(self, frag: Fragment) -> Fragment:
+        return ""
+
+
+# This is the method used in the paper
+class RandomSubsliceFragmentCreator(BaseFragmentCreator):
+    def __init__(self, max_fragment_size=50) -> None:
+        super().__init__()
+        self.max_fragment_size = max_fragment_size
+
+    def create_fragment(self, frag: Fragment) -> Fragment:
+        """
+        Creates a random sub-slice fragment from the tokens
+        """
+        tokens = frag.tokens
+
+        startIdx = np.random.randint(0, len(tokens) - 1)
+
+        endIdx = np.random.randint(
+            startIdx + 1, min(len(tokens), startIdx + self.max_fragment_size)
+        )
+        return Fragment(smiles=None, tokens=tokens[startIdx:endIdx])
+
+
+class BricksFragmentCreator(BaseFragmentCreator):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def create_fragment(self, frag: Fragment) -> Fragment:
+        """
+        Creates the BRICS fragments and takes one randomly
+        """
+        smiles = frag.smiles
+        m = Chem.MolFromSmiles(smiles)
+        if m is None:
+            return ""
+
+        res = list(BRICSDecompose(m, minFragmentSize=3))
+        # print(res)
+        return random.choice(res)
+
+
+class RecapFragmentCreator(BaseFragmentCreator):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def create_fragment(self, frag: Fragment) -> Fragment:
+        """
+        Creates the Recap fragments and takes one randomly
+        """
+        smiles = frag.smiles
+        m = Chem.MolFromSmiles(smiles)
+        if m is None:
+            return ""
+
+        res = RecapDecompose(m, minFragmentSize=3).GetAllChildren()
+        # print(res)
+        return random.choice(res)
+
+
+class MolFragsFragmentCreator(BaseFragmentCreator):
+    def __init__(self) -> None:
+        super().__init__()
+
+    def create_fragment(self, frag: Fragment) -> Fragment:
+        """
+        Splits the molecule into its disconnected fragments and takes one randomly
+        """
+        smiles = frag.smiles
+        m = Chem.MolFromSmiles(smiles)
+        if m is None:
+            return ""
+
+        res = list(Chem.rdmolops.GetMolFrags(m, asMols=True))
+        res = [Chem.MolToSmiles(m) for m in res]
+        # print(res)
+        return random.choice(res)
+
+
+def fragment_creator_factory(key: Union[str, None]):
+    if key is None:
+        return None
+
+    if key == "mol_frags":
+        return MolFragsFragmentCreator()
+    elif key == "recap":
+        return RecapFragmentCreator()
+    elif key == "bricks":
+        return BricksFragmentCreator()
+    elif key == "rss":
+        return RandomSubsliceFragmentCreator()
+    else:
+        raise ValueError(f"Do not have factory for the given key: {key}")
+
+
+if __name__ == "__main__":
+    from tokenizer import SmilesTokenizer
+
+    tokenizer = SmilesTokenizer()
+
+    creator = BricksFragmentCreator()
+    # creator = MolFragsFragmentCreator()
+
+    # creator = RecapFragmentCreator()
+
+    # wrap the SMILES in a Fragment so create_fragment can read .smiles
+    frag = creator.create_fragment(Fragment(smiles="CC(=O)NC1=CC=C(C=C1)O", tokens=None))
+
+    print(frag)
+    tokens = tokenizer.encode(frag)
+    print(tokens)
+    print([tokenizer._convert_id_to_token(t) for t in tokens])
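One caveat worth flagging: despite the `-> Fragment` annotations, the BRICS, Recap, and mol-frags creators return a bare SMILES string (and `""` when parsing fails), while the `"rss"` creator returns a `Fragment`. A hedged sketch of a small wrapper that normalizes this for callers, assuming the classes above are importable from `fragment_creator`; not part of the commit:

```python
from typing import List, Optional
from fragment_creator import Fragment, fragment_creator_factory

def make_fragment(key: str, smiles: str, tokens: Optional[List[int]] = None) -> Fragment:
    # Normalize the creators' mixed return types (Fragment vs. bare SMILES string).
    creator = fragment_creator_factory(key)
    result = creator.create_fragment(Fragment(smiles=smiles, tokens=tokens))
    if isinstance(result, Fragment):
        return result  # the "rss" creator already returns a Fragment (it needs tokens)
    return Fragment(smiles=result or None, tokens=None)

print(make_fragment("bricks", "CC(=O)NC1=CC=C(C=C1)O"))
```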
generate_paper_graphs.sh
ADDED
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# TODO: Change FULL_PATH_TO_CONDA to the binary where the conda folder is: see https://github.com/conda/conda/issues/8536
+conda activate FULL_PATH_TO_CONDA/torch2-llamol
+
+array=( logp sascore mol_weight )
+# python sample.py --num_samples 20000 --num_samples_per_step 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet"
+# for i in "${array[@]}"
+# do
+#     python sample.py --num_samples 10000 --num_samples_per_step 500 --kv_caching --ckpt_path "out/llama2-M-Full-RSS.pt" --context_cols "$i" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet"
+# done
+
+# 2 Combinations
+python sample.py --num_samples 1000 --seed 4321 --kv_caching --ckpt_path "out/llama2-M-Full-RSS.pt" --context_cols logp sascore --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet"
+python sample.py --num_samples 1000 --seed 4321 --kv_caching --ckpt_path "out/llama2-M-Full-RSS.pt" --context_cols logp mol_weight --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet"
+python sample.py --num_samples 1000 --seed 4321 --kv_caching --ckpt_path "out/llama2-M-Full-RSS.pt" --context_cols sascore mol_weight --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet"
+
+# # # All 3
+# python sample.py --num_samples 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --context_cols logp sascore mol_weight --kv_caching --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --seed 4312
get_fragment_table.sh
ADDED
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# TODO: Change FULL_PATH_TO_CONDA to the binary where the conda folder is: see https://github.com/conda/conda/issues/8536
+conda activate FULL_PATH_TO_CONDA/torch2-llamol
+
+
+# context_smiles=("c1ccccc1" "s1cccc1" "C1=CSC=C1" "CC1=CSC=C1" "C1=CC=C2C(=C1)C3=CC=CC=C3S2" "CCO" "CC=O" "CC(=O)OC1=CC=CC=C1C(=O)O" "CC(=O)NC1=CC=C(C=C1)O" "CC(C)CC1=CC=C(C=C1)C(C)C(=O)O" "OC(=O)C(C)c1ccc(cc1)CC(C)C" "C1C(=O)NC(=O)NC1=O" "CN1C=NC2=C1C(=O)N(C(=O)N2C)C" "CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(C=C4)O" "CN1CCC23C4C1CC5=C2C(=C(C=C5)OC)OC3C(=O)CC4")
+# context_smiles=("CN1CCC23C4C1CC5=C2C(=C(C=C5)O)OC3C(C=C4)O" "CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@H]3[C@H](C=C4)O" "CN1CCC23C4C1CC5=C2C(=C(C=C5)OC)OC3C(=O)CC4" "CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)OC)O[C@H]3C(=O)CC4" )
+# context_smiles=("C1=CSC=C1" )
+context_smiles=("C1=CSC=C1" "CC=O" "CC(=O)NC1=CC=C(C=C1)O" "CN1C=NC2=C1C(=O)N(C(=O)N2C)C")
+for smi in "${context_smiles[@]}"; do
+    # Only fragment generation
+    # output=$(python sample.py --num_samples 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --context_smi "$smi")
+
+    # Fragment and LogP
+    # output=$(python sample.py --num_samples 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --context_smi "$smi" --context_cols "logp" )
+
+    # Fragment and Sascore
+    # output=$(python sample.py --num_samples 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --context_smi "$smi" --context_cols "sascore" )
+
+    # Fragment and Mol weight
+    # output=$(python sample.py --num_samples 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --context_smi "$smi" --context_cols "mol_weight" )
+
+    # Multi Fragment Condition
+
+    # Logp + Sascore
+    # output=$(python sample.py --num_samples 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --context_smi "$smi" --context_cols "logp" "sascore" )
+
+
+    # Logp + Mol Weight
+    # output=$(python sample.py --num_samples 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --context_smi "$smi" --context_cols "logp" "mol_weight" )
+
+    # Sascore + Mol Weight
+    # output=$(python sample.py --num_samples 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --context_smi "$smi" --context_cols "sascore" "mol_weight" )
+
+    # Logp + Sascore + Mol Weight
+    output=$(python sample.py --num_samples 1000 --ckpt_path "out/llama2-M-Full-RSS.pt" --max_new_tokens 256 --cmp_dataset_path="data/OrganiX13.parquet" --context_smi "$smi" --context_cols "logp" "sascore" "mol_weight" )
+
+
+    echo "SMI: $smi"
+    echo "----------------------"
+done
model.py
ADDED
@@ -0,0 +1,787 @@
+from __future__ import annotations
+
+import math
+import pickle
+import struct
+import inspect
+from dataclasses import dataclass, field
+from typing import Any, Dict, Optional, Tuple, List, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from tqdm.auto import tqdm
+
+from tokenizer import SmilesTokenizer
+
+
+@dataclass
+class ModelArgs:
+    dim: int = 4096
+    n_layers: int = 32
+    n_heads: int = 32
+    n_kv_heads: Optional[int] = None
+    vocab_size: int = -1  # defined later by tokenizer
+    multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
+    norm_eps: float = 1e-5
+    max_seq_len: int = 2048
+    dropout: float = 0.0
+
+
+@dataclass
+class ContextArgs:
+    context_keys: List[str] = field(default_factory=list)
+    context_dims: List[int] = field(default_factory=list)
+
+
+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int, eps: float):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.ones(dim))
+
+    def _norm(self, x):
+        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+    def forward(self, x):
+        output = self._norm(x.float()).type_as(x)
+        return output * self.weight
+
+
+def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
+    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+    t = torch.arange(end, device=freqs.device)  # type: ignore
+    freqs = torch.outer(t, freqs).float()  # type: ignore
+    freqs_cos = torch.cos(freqs)  # real part
+    freqs_sin = torch.sin(freqs)  # imaginary part
+    return freqs_cos, freqs_sin
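For orientation, the RoPE tables returned by `precompute_freqs_cis` are plain real tensors of shape `(max_seq_len, head_dim // 2)`, one cosine/sine pair per position and frequency. A quick shape and rotation sanity check, using toy sizes chosen only for illustration:

```python
import torch
from model import precompute_freqs_cis

# Assumed toy sizes for illustration only.
dim, n_heads, max_seq_len = 256, 8, 128
head_dim = dim // n_heads

freqs_cos, freqs_sin = precompute_freqs_cis(head_dim, max_seq_len)
print(freqs_cos.shape)  # torch.Size([128, 16]) == (max_seq_len, head_dim // 2)
# cos^2 + sin^2 == 1 for every (position, frequency) pair, as expected for a rotation.
print(torch.allclose(freqs_cos**2 + freqs_sin**2, torch.ones_like(freqs_cos)))
```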
+
+
+def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
+    ndim = x.ndim
+    assert 0 <= 1 < ndim
+    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
+    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+    return freqs_cis.view(shape)
+
+
+def apply_rotary_emb(
+    xq: torch.Tensor, xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # reshape xq and xk to match the complex representation
+    xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1)
+    xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1)
+
+    # reshape freqs_cos and freqs_sin for broadcasting
+    freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)
+    freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)
+
+    # apply rotation using real numbers
+    xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin
+    xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos
+    xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin
+    xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos
+
+    # flatten last two dimensions
+    xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3)
+    xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3)
+
+    return xq_out.type_as(xq), xk_out.type_as(xk)
+
+
+def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
+    bs, slen, n_kv_heads, head_dim = x.shape
+    if n_rep == 1:
+        return x
+    return (
+        x[:, :, :, None, :]
+        .expand(bs, slen, n_kv_heads, n_rep, head_dim)
+        .reshape(bs, slen, n_kv_heads * n_rep, head_dim)
+    )
+
+
+class Attention(nn.Module):
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
+        model_parallel_size = 1
+        self.n_local_heads = args.n_heads // model_parallel_size
+        self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
+        self.n_rep = self.n_local_heads // self.n_local_kv_heads
+        self.head_dim = args.dim // args.n_heads
+        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
+        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
+        self.attn_dropout = nn.Dropout(args.dropout)
+        self.resid_dropout = nn.Dropout(args.dropout)
+        self.dropout = args.dropout
+        self.cache_hash = None
+
+        # use flash attention or a manual implementation?
+        self.flash = hasattr(torch.nn.functional, "scaled_dot_product_attention")
+        if not self.flash:
+            print(
+                "WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0"
+            )
+            mask = torch.full((1, 1, args.max_seq_len, args.max_seq_len), float("-inf"))
+            mask = torch.triu(mask, diagonal=1)
+            self.register_buffer("mask", mask)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        freqs_cos: torch.Tensor,
+        freqs_sin: torch.Tensor,
+    ):
+        bsz, seqlen, _ = x.shape
+
+        # QKV
+        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+        xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+        xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+        xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+        # RoPE relative positional embeddings
+        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
+
+        # grouped multiquery attention: expand out keys and values
+        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+
+        # make heads into a batch dimension
+        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+        xk = xk.transpose(1, 2)
+        xv = xv.transpose(1, 2)
+
+        # flash implementation
+        if self.flash:
+            output = torch.nn.functional.scaled_dot_product_attention(
+                xq,
+                xk,
+                xv,
+                attn_mask=None,
+                dropout_p=self.dropout if self.training else 0.0,
+                is_causal=True,
+            )
+        else:
+            # manual implementation
+            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
+            assert hasattr(self, "mask")
+            scores = (
+                scores + self.mask[:, :, :seqlen, :seqlen]
+            )  # (bs, n_local_heads, seqlen, cache_len + seqlen)
+            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
+            scores = self.attn_dropout(scores)
+            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)
+
+        # restore time as batch dimension and concat heads
+        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
+
+        # final projection into the residual stream
+        output = self.wo(output)
+        output = self.resid_dropout(output)
+        return output
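A small sanity check of the `repeat_kv` helper above, mirroring its docstring's claim that it matches `torch.repeat_interleave(x, dim=2, repeats=n_rep)`; the toy shapes are chosen arbitrarily:

```python
import torch
from model import repeat_kv

# Toy grouped-query setup: 2 KV heads shared by 6 query heads -> n_rep = 3.
x = torch.randn(1, 5, 2, 4)  # (batch, seq, n_kv_heads, head_dim)
out = repeat_kv(x, n_rep=3)
print(out.shape)  # torch.Size([1, 5, 6, 4])
print(torch.equal(out, torch.repeat_interleave(x, dim=2, repeats=3)))  # True
```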
+
+    def forward_with_kvcache(
+        self,
+        x: torch.Tensor,
+        freqs_cos: torch.Tensor,
+        freqs_sin: torch.Tensor,
+        cache_id: int = 1,
+    ):
+        bsz, seqlen, _ = x.shape
+
+        original_x = x
+        use_cache = self.cache_hash == cache_id
+        if use_cache:
+            x = x[:, -1, :].unsqueeze(1)  # only need the last new token
+        # QKV
+        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
+        if use_cache:
+            # comp_xq, comp_xk, comp_xv = self.wq(original_x), self.wk(original_x), self.wv(original_x)
+            # comp_xq = comp_xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+            # comp_xk = comp_xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+            # comp_xv = comp_xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+            # # RoPE relative positional embeddings
+            # comp_xq, comp_xk = apply_rotary_emb(comp_xq, comp_xk, freqs_cos, freqs_sin)
+
+            self.k_cache = torch.concat([self.k_cache, xk.clone()], dim=1)
+            self.v_cache = torch.concat([self.v_cache, xv.clone()], dim=1)
+            # print("Before positional xk:", torch.all(self.k_cache == self.wk(original_x)))
+            # print("Before positional xv:", torch.all(self.v_cache == self.wv(original_x)))
+
+            seqlen = self.k_cache.size(1)
+            xk = self.k_cache
+            xv = self.v_cache
+            self.cache_hash = cache_id
+            xq = xq.view(bsz, 1, self.n_local_heads, self.head_dim)
+            xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+            xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+            # RoPE relative positional embeddings
+            # xq, xk = apply_rotary_emb(xq, xk[:,-1,:,:].unsqueeze(1), freqs_cos[-1,:].unsqueeze(0), freqs_sin[-1,:].unsqueeze(0))
+            # reshape xq and xk to match the complex representation
+            xq_r, xq_i = xq.float().reshape(xq.shape[:-1] + (-1, 2)).unbind(-1)
+            xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1)
+
+            # reshape freqs_cos and freqs_sin for broadcasting
+            q_freq_cos = freqs_cos[-1, :].unsqueeze(0)
+            q_freq_sin = freqs_sin[-1, :].unsqueeze(0)
+            freqs_cos_q = reshape_for_broadcast(q_freq_cos, xq_r)
+            freqs_sin_q = reshape_for_broadcast(q_freq_sin, xq_r)
+
+            freqs_cos_k = reshape_for_broadcast(freqs_cos, xk_r)
+            freqs_sin_k = reshape_for_broadcast(freqs_sin, xk_r)
+
+            # apply rotation using real numbers
+            xq_out_r = xq_r * freqs_cos_q - xq_i * freqs_sin_q
+            xq_out_i = xq_r * freqs_sin_q + xq_i * freqs_cos_q
+            xk_out_r = xk_r * freqs_cos_k - xk_i * freqs_sin_k
+            xk_out_i = xk_r * freqs_sin_k + xk_i * freqs_cos_k
+
+            # flatten last two dimensions
+            xq_out = torch.stack([xq_out_r, xq_out_i], dim=-1).flatten(3)
+            xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3)
+
+            xq, xk = xq_out.type_as(xq), xk_out.type_as(xk)
+            # print(f"Seq len {xk.shape[1]} xq:", torch.allclose(xq , comp_xq[:,-1,:].unsqueeze(1), atol=1e-7), torch.mean(xq - comp_xq[:,-1,:].unsqueeze(1)))
+            # print(f"Seq len {xk.shape[1]} xk:", torch.allclose(xk ,comp_xk, atol=1e-7), torch.mean(xk - comp_xk))
+            # print(f"Seq len {xk.shape[1]} xv:", torch.allclose(xv , comp_xv, atol=1e-7), torch.mean(xv - comp_xv))
+            # print("-"*10)
+            # self.old_x = original_x
+        else:
+            self.k_cache = xk
+            self.v_cache = xv
+            self.old_x = x
+
+            xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
+            xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+            xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)
+
+            self.cache_hash = cache_id
+
+            # RoPE relative positional embeddings
+            xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)
+
+        # grouped multiquery attention: expand out keys and values
+        xk = repeat_kv(xk, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+        xv = repeat_kv(xv, self.n_rep)  # (bs, seqlen, n_local_heads, head_dim)
+
+        # make heads into a batch dimension
+        xq = xq.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
+        xk = xk.transpose(1, 2)
+        xv = xv.transpose(1, 2)
+
+        # flash implementation
+        if self.flash:
+            output = torch.nn.functional.scaled_dot_product_attention(
+                xq,
+                xk,
+                xv,
+                attn_mask=None,
+                dropout_p=self.dropout if self.training else 0.0,
+                # NOTE: VERY IMPORTANT to set is_causal=False, OTHERWISE the KV-Caching just breaks
+                is_causal=False,
+            )
+        else:
+            # manual implementation
+            scores = torch.matmul(xq, xk.transpose(2, 3)) / math.sqrt(self.head_dim)
+            assert hasattr(self, "mask")
+            scores = (
+                scores + self.mask[:, :, :seqlen, :seqlen]
+            )  # (bs, n_local_heads, seqlen, cache_len + seqlen)
+            scores = F.softmax(scores.float(), dim=-1).type_as(xq)
+            scores = self.attn_dropout(scores)
+            output = torch.matmul(scores, xv)  # (bs, n_local_heads, seqlen, head_dim)
+
+        # restore time as batch dimension and concat heads
+        # if use_cache:
+        #     # original_x[:,-1,:] = output.transpose(1, 2).contiguous().view(bsz,-1)
+        #     # output = original_x
+        #     output = torch.concat( [self.out_cache, output.transpose(1, 2).view(bsz,1,-1)], dim=1).contiguous()
+        #     self.out_cache = output
+        # else:
+        #     output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
+        #     self.out_cache = output
+
+        # NOTE: only works when fed one token at a time (i.e. seq == 1)
+        output = output.transpose(1, 2).contiguous().view(bsz, x.size(1), -1)
+
+        # final projection into the residual stream
+        output = self.wo(output)
+        output = self.resid_dropout(output)
+        return output
+
+
+class FeedForward(nn.Module):
+    def __init__(self, dim: int, hidden_dim: int, multiple_of: int, dropout: float):
+        super().__init__()
+        hidden_dim = int(2 * hidden_dim / 3)
+        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
+        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
+        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
+        self.w3 = nn.Linear(dim, hidden_dim, bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        return self.dropout(self.w2(F.silu(self.w1(x)) * self.w3(x)))
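The two `hidden_dim` lines in `FeedForward.__init__` implement the usual SwiGLU sizing: shrink the nominal `4 * dim` expansion by 2/3 (keeping the parameter count comparable to a plain two-matrix MLP despite the third weight matrix) and round up to a multiple of `multiple_of`. Worked through for an assumed `dim = 320`:

```python
dim, multiple_of = 320, 256           # assumed example values
hidden_dim = 4 * dim                  # 1280, nominal expansion
hidden_dim = int(2 * hidden_dim / 3)  # 853, SwiGLU 2/3 shrink
hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
print(hidden_dim)                     # 1024, rounded up to a multiple of 256
```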
+
+
+class TransformerBlock(nn.Module):
+    def __init__(self, layer_id: int, args: ModelArgs):
+        super().__init__()
+        self.n_heads = args.n_heads
+        self.dim = args.dim
+        self.head_dim = args.dim // args.n_heads
+        self.attention = Attention(args)
+        self.feed_forward = FeedForward(
+            dim=args.dim,
+            hidden_dim=4 * args.dim,
+            multiple_of=args.multiple_of,
+            dropout=args.dropout,
+        )
+        self.layer_id = layer_id
+        self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
+        self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
+
+    def forward(self, x, freqs_cos, freqs_sin):
+        h = x + self.attention.forward(self.attention_norm(x), freqs_cos, freqs_sin)
+        out = h + self.feed_forward.forward(self.ffn_norm(h))
+        return out
+
+    def forward_with_kvcache(self, x, freqs_cos, freqs_sin, cache_id=1):
+        h = x + self.attention.forward_with_kvcache(
+            self.attention_norm(x), freqs_cos, freqs_sin, cache_id=cache_id
+        )
+        out = h + self.feed_forward.forward(self.ffn_norm(h))
+        return out
+
+
+class Transformer(nn.Module):
+    last_loss: Optional[torch.Tensor]
+
+    def __init__(self, params: ModelArgs, context_params: ContextArgs):
+        super().__init__()
+        self.params = params
+        self.context_params = context_params
+        self.vocab_size = params.vocab_size
+        self.n_layers = params.n_layers
+
+        self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
+
+        self.frag_embeddings = nn.Embedding(params.vocab_size, params.dim)
+        self.frag_type_embedding = nn.Embedding(1, params.dim)
+
+        self.context_lookup = {k: i for i, k in enumerate(context_params.context_keys)}
+        self.conditions_type_embeddings = nn.Embedding(
+            len(context_params.context_keys), params.dim
+        )
+        self.conditions_embeddings_lookup = nn.ModuleDict(
+            {
+                k: nn.Sequential(
+                    nn.Linear(dim, params.dim, bias=True),
+                )
+                for k, dim in zip(
+                    context_params.context_keys, context_params.context_dims
+                )
+            }
+        )
+
+        self.dropout = nn.Dropout(params.dropout)
+        self.layers = torch.nn.ModuleList()
+        for layer_id in range(params.n_layers):
+            self.layers.append(TransformerBlock(layer_id, params))
+        self.norm = RMSNorm(params.dim, eps=params.norm_eps)
+        self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
+
+        # share the unembedding parameters with the embedding parameters
+        self.tok_embeddings.weight = (
+            self.output.weight
+        )  # https://paperswithcode.com/method/weight-tying
+
+        # some useful precompute for the RoPE relative positional embeddings
+        freqs_cos, freqs_sin = precompute_freqs_cis(
+            self.params.dim // self.params.n_heads, self.params.max_seq_len
+        )
+        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
+        self.register_buffer("freqs_sin", freqs_sin, persistent=False)
+
+        # init all weights
+        self.apply(self._init_weights)
+        # apply special scaled init to the residual projections, per GPT-2 paper
+        for pn, p in self.named_parameters():
+            if pn.endswith("w3.weight") or pn.endswith("wo.weight"):
+                torch.nn.init.normal_(
+                    p, mean=0.0, std=0.02 / math.sqrt(2 * params.n_layers)
+                )
+
+        # Initialize attribute for the loss of the last forward call. This will be set if the forward is called with a targets tensor.
+        self.last_loss = None
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+            if module.bias is not None:
+                torch.nn.init.zeros_(module.bias)
+        elif isinstance(module, nn.Embedding):
+            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
+
+    def forward(
+        self,
+        tokens: torch.Tensor,
+        targets: Optional[torch.Tensor] = None,
+        context: Optional[Dict[str, torch.Tensor]] = None,
+        fragment: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        bsz, seqlen = tokens.shape
+        device = tokens.device
+
+        h = self._add_context_to_seq(tokens, context, fragment, bsz, device)
+
+        context_seq_len = h.shape[1] - seqlen
+
+        bsz, seqlen, _ = h.shape
+
+        freqs_cos = self.freqs_cos[:seqlen]
+        freqs_sin = self.freqs_sin[:seqlen]
+
+        for layer in self.layers:
+            h = layer(h, freqs_cos, freqs_sin)
+        h = self.norm(h)
+
+        h = h[:, context_seq_len:]
+        if targets is not None:
+            # if we are given some desired targets also calculate the loss
+            logits = self.output(h)
+            tmp_last_loss = F.cross_entropy(
+                logits.reshape(-1, logits.size(-1)),
+                targets.reshape(-1),
+                ignore_index=0,  # Ignore Pad Tokens
+            )
+
+            # NOTE: This essentially does nothing for the computation,
+            # because we are multiplying the weights by zero.
+            # It *needs* to be done so that we can train with DDP:
+            # due to the random training process, some of the weights are not used in the forward pass,
+            # which is unacceptable for the c10 backend and makes the training error out.
+            # Maybe there is a better fix in the future, see:
+            # https://github.com/pytorch/pytorch/issues/43259
+            ddp_fix = sum(p.sum() for p in self.parameters())
+            zero_sum = ddp_fix * 0.0
+
+            self.last_loss = tmp_last_loss + zero_sum
+        else:
+            # inference-time mini-optimization: only forward the output on the very last position
+            logits = self.output(
+                h[:, [-1], :]
+            )  # note: using list [-1] to preserve the time dim
+            self.last_loss = None
+
+        return logits
+
+    def forward_with_kvcache(
+        self,
+        tokens: torch.Tensor,
+        targets: Optional[torch.Tensor] = None,
+        context: Optional[Dict[str, torch.Tensor]] = None,
+        fragment: Optional[torch.Tensor] = None,
+        cache_id: int = 1,
+        pos_seq_len: Optional[int] = None,
+    ) -> torch.Tensor:
+        bsz, seqlen = tokens.shape
+        device = tokens.device
+
+        h = self._add_context_to_seq(tokens, context, fragment, bsz, device)
+
+        context_seq_len = h.shape[1] - seqlen
+
+        bsz, seqlen, _ = h.shape
+        if pos_seq_len is None:
+            pos_seq_len = seqlen
+        else:
+            pos_seq_len = max(seqlen, pos_seq_len + context_seq_len)
+
+        freqs_cos = self.freqs_cos[:pos_seq_len]
|
509 |
+
freqs_sin = self.freqs_sin[:pos_seq_len]
|
510 |
+
|
511 |
+
for layer in self.layers:
|
512 |
+
h = layer.forward_with_kvcache(h, freqs_cos, freqs_sin, cache_id=cache_id)
|
513 |
+
h = self.norm(h)
|
514 |
+
|
515 |
+
h = h[:, context_seq_len:]
|
516 |
+
if targets is not None:
|
517 |
+
# if we are given some desired targets also calculate the loss
|
518 |
+
logits = self.output(h)
|
519 |
+
tmp_last_loss = F.cross_entropy(
|
520 |
+
logits.reshape(-1, logits.size(-1)),
|
521 |
+
targets.reshape(-1),
|
522 |
+
ignore_index=0, # Ignore Pad Tokens
|
523 |
+
)
|
524 |
+
|
525 |
+
# NOTE: This essentially does nothing for the computation,
|
526 |
+
# because we are multiplying the weights by zero.
|
527 |
+
# This *needs* to be done, so that we can train with DDP
|
528 |
+
# As due to the random training process some of the weights are not used in the forward pass
|
529 |
+
# That is unacceptable for the for the c10 backend and the training errors out.
|
530 |
+
# Maybe there is a better fix in the future, see:
|
531 |
+
# https://github.com/pytorch/pytorch/issues/43259
|
532 |
+
ddp_fix = sum(p.sum() for p in self.parameters())
|
533 |
+
zero_sum = ddp_fix * 0.0
|
534 |
+
|
535 |
+
self.last_loss = tmp_last_loss + zero_sum
|
536 |
+
else:
|
537 |
+
# inference-time mini-optimization: only forward the output on the very last position
|
538 |
+
logits = self.output(
|
539 |
+
h[:, [-1], :]
|
540 |
+
) # note: using list [-1] to preserve the time dim
|
541 |
+
self.last_loss = None
|
542 |
+
|
543 |
+
return logits
|
544 |
+
|
545 |
+
def _add_context_to_seq(self, tokens, context, fragment, bsz, device):
|
546 |
+
h = self.tok_embeddings(tokens)
|
547 |
+
h = self.dropout(h)
|
548 |
+
|
549 |
+
if fragment is not None:
|
550 |
+
fragment_type_enc = torch.zeros_like(
|
551 |
+
fragment, dtype=torch.long, device=device
|
552 |
+
)
|
553 |
+
|
554 |
+
h = torch.concat(
|
555 |
+
(
|
556 |
+
self.tok_embeddings(fragment)
|
557 |
+
+ self.frag_embeddings(fragment)
|
558 |
+
+ self.frag_type_embedding(fragment_type_enc),
|
559 |
+
h,
|
560 |
+
),
|
561 |
+
dim=1,
|
562 |
+
)
|
563 |
+
|
564 |
+
if context is not None and len(context) != 0:
|
565 |
+
# context is a dictionary with key : context_tensor of shape (batch_size, context_dim)
|
566 |
+
type_ids = []
|
567 |
+
context_vals = []
|
568 |
+
|
569 |
+
for emb_key, context_val in context.items():
|
570 |
+
emb_context_val = self.conditions_embeddings_lookup[emb_key](
|
571 |
+
context_val.unsqueeze(1).to(device)
|
572 |
+
).unsqueeze(1)
|
573 |
+
|
574 |
+
context_vals.append(emb_context_val)
|
575 |
+
type_ids_tensor = torch.tensor(
|
576 |
+
[self.context_lookup[emb_key]], device=device, dtype=torch.long
|
577 |
+
)
|
578 |
+
type_ids.append(type_ids_tensor)
|
579 |
+
|
580 |
+
context_types = (
|
581 |
+
torch.concat(type_ids, dim=0).reshape(-1, 1).expand(-1, bsz).T
|
582 |
+
)
|
583 |
+
# shape(len(context),batch_size, emb_size)
|
584 |
+
context_types = self.conditions_type_embeddings(context_types)
|
585 |
+
|
586 |
+
context_vals = torch.concat(context_vals, dim=1).to(device)
|
587 |
+
|
588 |
+
# SHAPE
|
589 |
+
h = torch.concat([context_vals + context_types, h], dim=1)
|
590 |
+
return h
|
591 |
+
|
592 |
+
def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
|
593 |
+
# start with all of the candidate parameters
|
594 |
+
param_dict = {pn: p for pn, p in self.named_parameters()}
|
595 |
+
# filter out those that do not require grad
|
596 |
+
param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
|
597 |
+
# create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
|
598 |
+
# i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
|
599 |
+
decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
|
600 |
+
nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
|
601 |
+
optim_groups = [
|
602 |
+
{"params": decay_params, "weight_decay": weight_decay},
|
603 |
+
{"params": nodecay_params, "weight_decay": 0.0},
|
604 |
+
]
|
605 |
+
num_decay_params = sum(p.numel() for p in decay_params)
|
606 |
+
num_nodecay_params = sum(p.numel() for p in nodecay_params)
|
607 |
+
print(
|
608 |
+
f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters"
|
609 |
+
)
|
610 |
+
print(
|
611 |
+
f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters"
|
612 |
+
)
|
613 |
+
# Create AdamW optimizer and use the fused version if it is available
|
614 |
+
fused_available = "fused" in inspect.signature(torch.optim.AdamW).parameters
|
615 |
+
use_fused = fused_available and device_type == "cuda"
|
616 |
+
extra_args = dict(fused=True) if use_fused else dict()
|
617 |
+
optimizer = torch.optim.AdamW(
|
618 |
+
optim_groups, lr=learning_rate, betas=betas, **extra_args
|
619 |
+
)
|
620 |
+
print(f"using fused AdamW: {use_fused}")
|
621 |
+
|
622 |
+
return optimizer
|
623 |
+
|
624 |
+
def estimate_mfu(self, fwdbwd_per_iter, dt):
|
625 |
+
"""estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS"""
|
626 |
+
# first estimate the number of flops we do per iteration.
|
627 |
+
# see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
|
628 |
+
N = sum(p.numel() for p in self.parameters())
|
629 |
+
cfg = self.params
|
630 |
+
L, H, Q, T = cfg.n_layers, cfg.n_heads, cfg.dim // cfg.n_heads, cfg.max_seq_len
|
631 |
+
flops_per_token = 6 * N + 12 * L * H * Q * T
|
632 |
+
flops_per_fwdbwd = flops_per_token * T
|
633 |
+
flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
|
634 |
+
# express our flops throughput as ratio of A100 bfloat16 peak flops
|
635 |
+
flops_achieved = flops_per_iter * (1.0 / dt) # per second
|
636 |
+
flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
|
637 |
+
mfu = flops_achieved / flops_promised
|
638 |
+
return mfu
|
639 |
+
|
640 |
+
@torch.inference_mode()
|
641 |
+
def generate(
|
642 |
+
self,
|
643 |
+
tokenizer: SmilesTokenizer,
|
644 |
+
context: Union[torch.Tensor, None] = None,
|
645 |
+
fragments: Union[torch.Tensor, None] = None,
|
646 |
+
max_length: int = 50,
|
647 |
+
num_gen: int = 200,
|
648 |
+
start_smiles: Union[str, None] = None,
|
649 |
+
temperature: float = 1.0,
|
650 |
+
top_k: Union[int, None] = None,
|
651 |
+
device: torch.device = torch.device("cpu"),
|
652 |
+
cache_kv: bool = False,
|
653 |
+
) -> List[str]:
|
654 |
+
batch_size = num_gen
|
655 |
+
if start_smiles is not None:
|
656 |
+
tokenized_start_selfie = tokenizer.encode(start_smiles)[
|
657 |
+
:-1
|
658 |
+
] # remove <eos> token
|
659 |
+
tokenized_start_selfie = torch.tensor(
|
660 |
+
tokenized_start_selfie, device=device, dtype=torch.long
|
661 |
+
).view(-1, 1)
|
662 |
+
tokenized_start_selfie = tokenized_start_selfie.repeat(1, batch_size)
|
663 |
+
|
664 |
+
outputs = tokenized_start_selfie.T
|
665 |
+
else:
|
666 |
+
outputs = (
|
667 |
+
torch.LongTensor([[tokenizer.cls_token_id] * batch_size]).to(device)
|
668 |
+
).T # batch_size
|
669 |
+
self.eval()
|
670 |
+
|
671 |
+
start_len = outputs.shape[1]
|
672 |
+
has_end_idx = np.array([0] * batch_size)
|
673 |
+
cache_id = np.random.randint(0, int(1e10), 1).item()
|
674 |
+
with torch.no_grad():
|
675 |
+
with tqdm(total=max_length, desc="Generation") as pbar:
|
676 |
+
for i in range(start_len, max_length):
|
677 |
+
# trg_tensor = #torch.LongTensor(outputs).to(model.device)
|
678 |
+
if not cache_kv:
|
679 |
+
logits = self(outputs, context=context, fragment=fragments)
|
680 |
+
else:
|
681 |
+
# logits_ = self(outputs, context=context, fragment=fragments)
|
682 |
+
if i == start_len:
|
683 |
+
# When starting pass the whole input, so that "start_smiles" works, then only the newly generated token, because of the cache
|
684 |
+
func_input = outputs
|
685 |
+
else:
|
686 |
+
func_input = outputs[:, -1].unsqueeze(-1)
|
687 |
+
logits = self.forward_with_kvcache(
|
688 |
+
func_input,
|
689 |
+
context=context,
|
690 |
+
fragment=fragments,
|
691 |
+
cache_id=cache_id,
|
692 |
+
pos_seq_len=outputs.size(-1),
|
693 |
+
)
|
694 |
+
|
695 |
+
# raise NotImplementedError("Currently not working / right implemented")
|
696 |
+
# logits = self.forward_with_kvcache(outputs, context=context, fragment=fragments,cache_id = cache_id)
|
697 |
+
|
698 |
+
logits = logits[:, -1, :] # crop to just the final time step
|
699 |
+
if temperature == 0.0:
|
700 |
+
# "sample" the single most likely index
|
701 |
+
_, logits = torch.topk(logits, k=1, dim=-1)
|
702 |
+
else:
|
703 |
+
# pluck the logits at the final step and scale by desired temperature
|
704 |
+
logits = logits / temperature
|
705 |
+
# optionally crop the logits to only the top k options
|
706 |
+
if top_k is not None:
|
707 |
+
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
|
708 |
+
logits[logits < v[:, [-1]]] = -float("Inf")
|
709 |
+
|
710 |
+
probs = F.softmax(logits, dim=-1)
|
711 |
+
idx_next = torch.multinomial(probs, num_samples=1)
|
712 |
+
|
713 |
+
ended_sentences = idx_next == tokenizer.sep_token_id
|
714 |
+
if torch.count_nonzero(ended_sentences) != 0:
|
715 |
+
indicies = torch.nonzero(ended_sentences)
|
716 |
+
indicies = indicies.cpu().numpy()
|
717 |
+
for end_idx in indicies[:, 0]:
|
718 |
+
if has_end_idx[end_idx] == 0:
|
719 |
+
has_end_idx[end_idx] = i
|
720 |
+
|
721 |
+
# print(has_end_idx)
|
722 |
+
|
723 |
+
if all([idx != 0 for idx in has_end_idx]):
|
724 |
+
break
|
725 |
+
|
726 |
+
# outputs.append(best_guesses)
|
727 |
+
# outputs = torch.row_stack((outputs, idx_next))
|
728 |
+
outputs = torch.cat((outputs, idx_next), dim=1)
|
729 |
+
pbar.update(1)
|
730 |
+
|
731 |
+
out_selfies = []
|
732 |
+
for output, end_idx in zip(outputs.cpu().numpy(), has_end_idx):
|
733 |
+
# Incase of limiting the max_len
|
734 |
+
if end_idx == 0:
|
735 |
+
selfie = [tokenizer._convert_id_to_token(idx) for idx in output[:]]
|
736 |
+
else:
|
737 |
+
selfie = [
|
738 |
+
tokenizer._convert_id_to_token(idx) for idx in output[:end_idx]
|
739 |
+
]
|
740 |
+
selfie = "".join(selfie[1:])
|
741 |
+
out_selfies.append(selfie)
|
742 |
+
|
743 |
+
# for indicies in outputs:
|
744 |
+
# translated_sentence = [tokenizer.idx_to_tokens[idx] for idx in outputs]
|
745 |
+
# remove start token
|
746 |
+
return out_selfies
|
747 |
+
|
748 |
+
@staticmethod
|
749 |
+
def load(path, device: torch.device = torch.device("cpu")) -> Transformer:
|
750 |
+
data = torch.load(path, map_location=device)
|
751 |
+
|
752 |
+
newinstace = Transformer(data["model_params"], data["context_params"])
|
753 |
+
newinstace.load_state_dict(data["state_dict"])
|
754 |
+
return newinstace.to(device)
|
755 |
+
|
756 |
+
def save(self, filepath):
|
757 |
+
torch.save(
|
758 |
+
{
|
759 |
+
"state_dict": self.state_dict(),
|
760 |
+
**dict(model_params=self.params, context_params=self.context_params),
|
761 |
+
},
|
762 |
+
filepath,
|
763 |
+
)
|
764 |
+
|
765 |
+
def getNumberTrainableParams(self) -> int:
|
766 |
+
return sum(p.numel() for p in self.parameters() if p.requires_grad)
|
767 |
+
|
768 |
+
def getNumberParams(self) -> int:
|
769 |
+
return sum(p.numel() for p in self.parameters())
|
770 |
+
|
771 |
+
|
772 |
+
if __name__ == "__main__":
|
773 |
+
m = Transformer(
|
774 |
+
ModelArgs(dim=128, n_layers=8, n_heads=8, vocab_size=512, max_seq_len=1024),
|
775 |
+
context_params=ContextArgs(
|
776 |
+
context_keys=["logp", "sascore", "mol_weight"], context_dims=[1, 1, 1]
|
777 |
+
),
|
778 |
+
)
|
779 |
+
seq = torch.ones((128, 50), dtype=torch.long)
|
780 |
+
frag = torch.ones((128, 10), dtype=torch.long)
|
781 |
+
context = {
|
782 |
+
"logp": torch.ones((128,), dtype=torch.float32),
|
783 |
+
# "sascore": torch.ones((128,), dtype=torch.float32),
|
784 |
+
"mol_weight": torch.ones((128,), dtype=torch.float32),
|
785 |
+
}
|
786 |
+
|
787 |
+
print(m.forward(seq, targets=seq, context=context, fragment=frag))
|
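
A minimal sampling sketch against the model above, assuming the checkpoint shipped in this commit (out/llama2-M-Full-RSS.pt) and the default SmilesTokenizer vocab; the property targets and sampling settings here are illustrative choices, not values fixed by the code:

import torch
from model import Transformer
from tokenizer import SmilesTokenizer

device = torch.device("cpu")
model = Transformer.load("out/llama2-M-Full-RSS.pt", device=device)
tokenizer = SmilesTokenizer()

num_gen = 8
# one target value per generated molecule; the keys must match the
# context_keys the checkpoint was trained with (logp / sascore / mol_weight
# throughout this commit)
context = {
    "logp": 2.0 * torch.ones((num_gen,), dtype=torch.float32, device=device),
    "sascore": 3.0 * torch.ones((num_gen,), dtype=torch.float32, device=device),
}
smiles = model.generate(
    tokenizer,
    context=context,
    num_gen=num_gen,
    max_length=128,
    temperature=0.8,
    device=device,
    cache_kv=True,  # reuse the per-layer KV cache during decoding
)
print(smiles)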
out/llama2-M-Full-RSS.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:83571f8f8936a4eac8ac4541282ff99a3e942c07ee4aaef82abdc2f52e1731ae
size 58587134
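
The checkpoint is tracked with Git LFS, so the pointer above stores only the sha256 ("oid") and byte size of the real file. A small sketch to confirm a fetched checkpoint matches the pointer (assumes the file was actually pulled with git lfs):

import hashlib

EXPECTED_OID = "83571f8f8936a4eac8ac4541282ff99a3e942c07ee4aaef82abdc2f52e1731ae"

sha = hashlib.sha256()
with open("out/llama2-M-Full-RSS.pt", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        sha.update(chunk)
print("checkpoint intact:", sha.hexdigest() == EXPECTED_OID)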
plot_utils.py
ADDED
@@ -0,0 +1,513 @@
from typing import Dict, List, Union
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import pandas as pd

from rdkit.Chem import AllChem, Descriptors, RDConfig

import sys

sys.path.append(os.path.join(RDConfig.RDContribDir, "SA_Score"))
# now you can import sascorer!
import sascorer
from rdkit import Chem
import logging

logger = logging.getLogger(__name__)
plt.rcParams.update({"font.size": 12.5})

COL_TO_DISPLAY_NAME = {
    "logp": "LogP",
    "sascore": "SAScore",
    "mol_weight": "Molecular Weight",
}


def calcContextSAScore(smiles: List[str]):
    sasc = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        sa = sascorer.calculateScore(mol)
        sasc.append(sa)

    return np.array(sasc)


def calcContextLogP(smiles: List[str]):
    logps = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        logp = Descriptors.MolLogP(mol)
        logps.append(logp)

    return np.array(logps)


def calcContextEnergy(smiles, num_confs=5):
    contexts = []
    for smi in smiles:
        mol = Chem.AddHs(Chem.MolFromSmiles(smi))
        AllChem.EmbedMultipleConfs(mol, num_confs, numThreads=48)
        conf_results = AllChem.MMFFOptimizeMoleculeConfs(mol, numThreads=48)
        energies = []
        for converged, energy in conf_results:
            if converged != 0:
                print("Not converged!", smi)
            energies.append(energy)

        # mean MMFF energy in kcal/mol ...
        mean_en = np.mean(energies)
        # ... converted to hartree
        mean_en = mean_en * 0.0016
        contexts.append(mean_en)

    return np.array(contexts)


def calcContextMolWeight(smiles: List[str]):
    con = []
    for _, smi in enumerate(smiles):
        mol = Chem.MolFromSmiles(smi)
        c = Descriptors.ExactMolWt(mol) / 100
        con.append(c)

    return np.array(con)


def plot_1D_condition(
    context_col,
    save_path,
    new_context,
    generated_smiles,
    temperature,
    context_dict,
    context_scaler=None,
):
    for con_col in context_col:
        # use a fresh per-column directory; reassigning save_path itself would
        # nest the output of later loop iterations inside earlier ones
        col_save_path = os.path.join(
            save_path, f"{con_col}_{'-'.join(context_col)}_temp{temperature}"
        )
        os.makedirs(col_save_path, exist_ok=True)

        current_context = new_context[con_col].cpu().detach().numpy()
        predicted_context = calc_context_from_smiles(generated_smiles, con_col)

        if context_scaler is not None:
            raise NotImplementedError("Not implemented yet")

        mean_vals_pred = []
        labels = np.unique(current_context)
        mse_value = []
        mad_value = []
        for label in labels:
            mask = (current_context == label).reshape(-1)
            mean_val = np.mean(predicted_context[mask])
            mean_vals_pred.append(mean_val)
            mse_value.extend((predicted_context[mask] - label) ** 2)
            mad_value.extend(abs(predicted_context[mask] - label))

        mse = np.mean(mse_value)
        mad = np.mean(mad_value)
        logger.info(f"MSE {mse}")
        logger.info(f"MAD {mad}")
        logger.info(f"SD: {np.std(mad_value)}")

        current_context = current_context.reshape(-1)

        # create a figure and axes
        fig, ax1 = plt.subplots()

        # scatter plot: ground truth vs prediction
        ax1.scatter(
            current_context,
            predicted_context,
            label="Ground Truth vs Prediction",
            c="blue",
            alpha=0.5,
        )
        ax1.plot(
            np.arange(np.min(current_context), np.max(current_context) + 1),
            np.arange(np.min(current_context), np.max(current_context) + 1),
            label="y=x",
            c="black",
        )
        ax1.scatter(labels, mean_vals_pred, label="Mean predicted values", c="red")
        ax1.set_xlabel("Ground Truth")
        ax1.set_ylabel("Prediction")

        # histogram of the dataset distribution on a twin axis sharing the x-axis
        ax2 = ax1.twinx()
        sns.histplot(
            context_dict[con_col],
            label="Dataset distribution",
            alpha=0.5,
            ax=ax2,
        )
        ax2.set_ylabel("Frequency")

        # combine the legends of both axes
        handles1, labels1 = ax1.get_legend_handles_labels()
        handles2, labels2 = ax2.get_legend_handles_labels()
        ax1.legend(handles1 + handles2, labels1 + labels2)

        plt.xlim((np.min(current_context), np.max(current_context) + 1))
        display_name = COL_TO_DISPLAY_NAME[con_col]
        plt.title(f"{display_name} - temperature: {temperature} - mse: {round(mse, 4)}")

        out_df = pd.DataFrame(
            {
                "smiles": generated_smiles,
                f"{con_col}": predicted_context.tolist(),
                f"target_{con_col}": current_context.tolist(),
            }
        )
        out_df.to_csv(os.path.join(col_save_path, "predictions.csv"), index=False)
        out_path = os.path.join(col_save_path, "graph.png")
        print(f"Saved to {out_path}")
        plt.savefig(out_path)
        plt.clf()


def plot_2D_condition(
    context_col,
    save_path,
    new_context,
    generated_smiles,
    temperature,
    label: Union[str, None] = None,
):
    save_path = os.path.join(
        save_path, f"multicond2_{'-'.join(context_col)}_temp={temperature}"
    )
    if label is not None:
        save_path = os.path.join(save_path, label)

    os.makedirs(save_path, exist_ok=True)
    delta_dict = {c: [] for c in context_col}
    predicted_context_dict = {}
    for con_col in context_col:
        current_context = new_context[con_col].cpu().numpy()
        predicted_context = calc_context_from_smiles(generated_smiles, con_col)

        predicted_context_dict[con_col] = np.array(predicted_context)
        delta_dict[con_col] = np.abs(current_context - np.array(predicted_context))

    # create a DataFrame from delta_dict
    df = pd.DataFrame(delta_dict)
    real_values_prop1 = new_context[context_col[0]].cpu().numpy()
    real_values_prop2 = new_context[context_col[1]].cpu().numpy()
    mse_vals_x = []
    mad_vals_x = []
    mse_vals_y = []
    mad_vals_y = []
    fig = plt.figure()
    ax = plt.subplot(111)
    for v1 in np.unique(real_values_prop1):
        for v2 in np.unique(real_values_prop2):
            mask = (real_values_prop1 == v1) & (real_values_prop2 == v2)
            indices = np.nonzero(mask)[0]
            # one random color per (v1, v2) target pair
            color = np.random.rand(
                3,
            )

            x_pred = predicted_context_dict[context_col[0]][indices].ravel()
            y_pred = predicted_context_dict[context_col[1]][indices].ravel()
            mse_vals_x.extend((x_pred - v1) ** 2)
            mad_vals_x.extend(np.abs(x_pred - v1))

            mse_vals_y.extend((y_pred - v2) ** 2)
            mad_vals_y.extend(np.abs(y_pred - v2))

            ax.scatter(x_pred, y_pred, color=color, alpha=0.5)
            ax.scatter(v1, v2, color=color, label=f"({v1}, {v2})", marker="^", s=20.0)

    mse_x = np.mean(mse_vals_x)
    mad_x = np.mean(mad_vals_x)
    mse_y = np.mean(mse_vals_y)
    mad_y = np.mean(mad_vals_y)

    logger.info(f"MSE {context_col[0]}: {mse_x}")
    logger.info(f"MAD {context_col[0]}: {mad_x}")
    logger.info(f"MSE {context_col[1]}: {mse_y}")
    logger.info(f"MAD {context_col[1]}: {mad_y}")

    file_path = os.path.join(save_path, "metrics.txt")

    with open(file_path, "w") as f:
        f.write(f"MSE {context_col[0]}: {mse_x} \n")
        f.write(f"MAD {context_col[0]}: {mad_x} \n")
        f.write(f"MSE {context_col[1]}: {mse_y} \n")
        f.write(f"MAD {context_col[1]}: {mad_y} \n")

    ax.set_xlabel(COL_TO_DISPLAY_NAME[context_col[0]])
    ax.set_ylabel(COL_TO_DISPLAY_NAME[context_col[1]])
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

    # put a legend to the right of the current axis
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    ax.set_title("Multi Property Distribution of Generated Molecules")
    out_path = os.path.join(save_path, "graph.png")
    logger.info(f"Saved to {out_path}")
    plt.savefig(out_path)
    plt.clf()
    return save_path


def plot_3D_condition(
    context_col, save_path, new_context, generated_smiles, temperature
):
    save_path = os.path.join(
        save_path, f"multicond3_{'-'.join(context_col)}_temp={temperature}"
    )
    os.makedirs(save_path, exist_ok=True)
    predicted_context_dict = {}
    for con_col in context_col:
        predicted_context = calc_context_from_smiles(generated_smiles, con_col)

        predicted_context_dict[con_col] = np.array(predicted_context)

    real_values_prop1 = new_context[context_col[0]].cpu().numpy()
    real_values_prop2 = new_context[context_col[1]].cpu().numpy()
    real_values_prop3 = new_context[context_col[2]].cpu().numpy()

    mse_vals_x = []
    mad_vals_x = []
    mse_vals_y = []
    mad_vals_y = []
    mse_vals_z = []
    mad_vals_z = []

    fig = plt.figure()
    ax = fig.add_subplot(projection="3d")
    for v1 in np.unique(real_values_prop1):
        for v2 in np.unique(real_values_prop2):
            for v3 in np.unique(real_values_prop3):
                mask = (
                    (real_values_prop1 == v1)
                    & (real_values_prop2 == v2)
                    & (real_values_prop3 == v3)
                )
                indices = np.nonzero(mask)[0]
                # one random color per (v1, v2, v3) target triple
                color = np.random.rand(
                    3,
                )

                x_pred = predicted_context_dict[context_col[0]][indices].ravel()
                y_pred = predicted_context_dict[context_col[1]][indices].ravel()
                z_pred = predicted_context_dict[context_col[2]][indices].ravel()

                mse_vals_x.extend((x_pred - v1) ** 2)
                mad_vals_x.extend(np.abs(x_pred - v1))

                mse_vals_y.extend((y_pred - v2) ** 2)
                mad_vals_y.extend(np.abs(y_pred - v2))

                mse_vals_z.extend((z_pred - v3) ** 2)
                mad_vals_z.extend(np.abs(z_pred - v3))

                # plot target and predictions with the same color
                ax.scatter(v1, v2, v3, color=color, label=f"({v1}, {v2}, {v3})", s=20.0)
                ax.scatter(
                    x_pred,
                    y_pred,
                    z_pred,
                    color=color,
                )

    mse_x = np.mean(mse_vals_x)
    mad_x = np.mean(mad_vals_x)
    mse_y = np.mean(mse_vals_y)
    mad_y = np.mean(mad_vals_y)
    mse_z = np.mean(mse_vals_z)
    mad_z = np.mean(mad_vals_z)

    logger.info(f"MSE {context_col[0]}: {mse_x}")
    logger.info(f"MAD {context_col[0]}: {mad_x}")
    logger.info(f"MSE {context_col[1]}: {mse_y}")
    logger.info(f"MAD {context_col[1]}: {mad_y}")
    logger.info(f"MSE {context_col[2]}: {mse_z}")
    logger.info(f"MAD {context_col[2]}: {mad_z}")

    file_path = os.path.join(save_path, "metrics.txt")

    with open(file_path, "w") as f:
        f.write(f"MSE {context_col[0]}: {mse_x} \n")
        f.write(f"MAD {context_col[0]}: {mad_x} \n")

        f.write(f"MSE {context_col[1]}: {mse_y} \n")
        f.write(f"MAD {context_col[1]}: {mad_y} \n")

        f.write(f"MSE {context_col[2]}: {mse_z} \n")
        f.write(f"MAD {context_col[2]}: {mad_z} \n")

    ax.set_xlabel(COL_TO_DISPLAY_NAME[context_col[0]])
    ax.set_ylabel(COL_TO_DISPLAY_NAME[context_col[1]])
    ax.set_zlabel(COL_TO_DISPLAY_NAME[context_col[2]])
    plt.legend(
        bbox_to_anchor=(1.035, 0.5),
        loc="center right",
        bbox_transform=plt.gcf().transFigure,
    )
    plt.subplots_adjust(left=0.05, bottom=0.1, right=0.775)

    plt.title("Multi Property Distribution of Generated Molecules")
    out_path = os.path.join(save_path, "graph.png")
    print(f"Saved to {out_path}")
    plt.savefig(out_path)
    plt.clf()

    return save_path


def calc_context_from_smiles(generated_smiles, con_col):
    if con_col == "mol_weight":
        predicted_context = calcContextMolWeight(generated_smiles)
    elif con_col == "logp":
        predicted_context = calcContextLogP(generated_smiles)
    elif con_col == "sascore":
        predicted_context = calcContextSAScore(generated_smiles)
    elif con_col == "energy":
        # TODO: Change to something better
        predicted_context = calcContextEnergy(generated_smiles)
    return predicted_context


def plot_unconditional(
    out_path: str = os.getcwd(),
    smiles: List[str] = [],
    temperature: float = 0.8,
    cmp_context_dict: Union[Dict[str, np.array], None] = None,
    context_cols: List[str] = ["logp", "sascore", "mol_weight"],
):
    out_path = os.path.join(out_path, "unconditional")
    os.makedirs(out_path, exist_ok=True)

    for c in context_cols:
        plt.clf()

        context_cal = calc_context_from_smiles(smiles, c)

        if cmp_context_dict is not None:
            sns.histplot(
                cmp_context_dict[c],
                stat="density",
                label="Dataset Distribution",
                alpha=0.75,
                color="blue",
            )
        sns.histplot(
            context_cal,
            stat="density",
            label="Generated Molecules Distribution",
            alpha=0.5,
            color="orange",
        )

        if c == "logp":
            plt.xlim((-6, 8))
        else:
            plt.xlim((0, 10))

        plt.xlabel(COL_TO_DISPLAY_NAME[c])
        plt.title(
            f"Unconditional Distribution {COL_TO_DISPLAY_NAME[c]} \nwith Temperature {temperature}"
        )
        plt.legend()

        out_file = os.path.join(out_path, f"unc_{c}_temp={temperature}.png")
        plt.savefig(out_file)
        logger.info(f"Saved Unconditional to {out_file}")


def novelty(gen, train):
    gen_smiles_set = set(gen) - {None}
    train_set = set(train)
    return len(gen_smiles_set - train_set) / len(gen_smiles_set)


def unique_at(gen, k=1000):
    gen = gen[:k]

    return len(set(gen)) / len(gen)


def check_metrics(generated_smiles: List[str], dataset_smiles: List[str]):
    len_before = len(generated_smiles)
    generated_smiles = [g for g in generated_smiles if g is not None]
    len_after = len(generated_smiles)

    novel = novelty(generated_smiles, dataset_smiles)
    unique_at_1k = unique_at(generated_smiles, k=1000)
    unique_at_10k = unique_at(generated_smiles, k=10000)
    return dict(
        novelty=novel,
        unique_at_1k=unique_at_1k,
        unique_at_10k=unique_at_10k,
        validity=len_after / float(len_before),
    )
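
A quick, self-contained sanity check of the metric helpers above, using made-up toy lists (RDKit must be installed, since importing plot_utils pulls in sascorer):

from plot_utils import check_metrics

generated = ["CCO", "c1ccccc1", None, "CCO"]  # one invalid (None), one duplicate
dataset = ["CCO", "CCN"]

print(check_metrics(generated, dataset))
# expected here: validity 0.75 (3 of 4 valid), novelty 0.5
# (one of the two distinct molecules is unseen in the dataset), uniqueness 2/3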
preprocess_dataset.py
ADDED
@@ -0,0 +1,370 @@
import argparse
import json
import os
import pickle
import random
from functools import partial

import pandas as pd
import numpy as np
import requests
import torch
import torch.distributed as dist
from tqdm import tqdm
import multiprocessing
from multiprocessing import Pool
from fragment_creator import BaseFragmentCreator, BricksFragmentCreator, Fragment
from tokenizer import SmilesTokenizer
from torch.utils.data.distributed import DistributedSampler
from rdkit import Chem
from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmiles
from tqdm.contrib.concurrent import process_map, thread_map
from typing import List
import swifter

DATA_CACHE_DIR = "data"


def _tokenize_smiles(
    smi: str,
    tokenizer: SmilesTokenizer = None,
    max_smiles_len=256,
    log_output=True,
):
    tokens = tokenizer.encode(smi)
    if len(tokens) > max_smiles_len:
        if log_output:
            print(f"Removing too long {smi} with smiles len of {len(tokens)}")
        return None

    return tokens


def _tokenize_scaffolds(smi: str, tokenizer=None, max_smiles_len=256, log_output=True):
    smi = MurckoScaffoldSmiles(smi)
    tokens = tokenizer.encode(smi)
    tokens = tokens[1:-1]  # remove [SEP] and [CLS] tokens
    if len(tokens) > max_smiles_len:
        if log_output:
            print(f"Removing too long {smi} with smiles len of {len(tokens)}")
        return None

    return tokens


def pad_batch(src, pad_idx):
    max_len = max([len(d) for d in src])
    padded_src = np.ones([len(src), max_len]) * pad_idx

    for i, j in enumerate(src):
        padded_src[i][0 : len(j)] = j

    # try to predict the next token from the previous tokens,
    # essentially reconstructing the src sentence from the embeddings and the previous sentence
    padded_src = padded_src.T
    return padded_src


def pretokenize(
    data_file=os.path.join(
        DATA_CACHE_DIR, "FULL_combined_zinc_pubchemqc_qm9_pc9_reddb_chembl.parquet"
    ),
    tokenizer=SmilesTokenizer(),
    limit=None,
    context=["logp", "sascore", "mol_weight"],
    out_name: str = "processed_dataset",
    remove_nan_context_rows: bool = False,
):
    df = pd.read_parquet(data_file)

    if limit is not None:
        df = df.sample(n=limit)
    else:
        # shuffle the rows
        df = df.sample(frac=1.0)

    cpu_count = multiprocessing.cpu_count()
    print(f"Running on {cpu_count} CPUs ")

    tqdm.pandas()

    # drop multi-fragment SMILES (those containing a ".")
    df["scaffolds"] = df["smiles"].progress_map(lambda s: None if "." in s else s)
    df["smiles"] = df["scaffolds"].copy()
    orig_len = len(df)
    if context is not None:
        if df.get("origin") is not None:
            # map dataset-origin labels to numeric ids and persist the mapping
            origins = df["origin"].unique()
            origin_dics = {}
            for i, o in enumerate(origins):
                df.loc[df["origin"] == o, "origin"] = i
                origin_dics[o] = i
            df["origin"] = df["origin"].astype(float)
            with open(
                os.path.join(
                    DATA_CACHE_DIR, os.path.basename(data_file) + "_origins.json"
                ),
                "w",
            ) as f:
                json.dump(origin_dics, f)

        mask = (
            ~df["smiles"].isna()
            & (
                (~df[context].isna()).all(axis=1)
                if remove_nan_context_rows
                else np.ones(len(df["smiles"]), dtype=bool)
            )
            & ~df["scaffolds"].isna()
        )
    else:
        mask = ~df["smiles"].isna()
    error_count = np.count_nonzero(~mask)
    df = df[mask]

    df["tokens"] = df["smiles"].swifter.apply(
        partial(_tokenize_smiles, tokenizer=tokenizer, log_output=False)
    )
    df["scaffolds"] = df["tokens"].copy()

    mask = ~df["tokens"].isna() & ~df["scaffolds"].isna()
    df = df[mask]
    error_count += np.count_nonzero(~mask)

    # shuffle the data
    df = df.sample(frac=1).reset_index(drop=True)

    if context is not None:
        context_list = df[context].to_numpy()
        context_dict = {k: context_list[:, i] for i, k in enumerate(context)}
    else:
        context_dict = {}

    print(f"Error count: {error_count} / {orig_len} = {error_count/orig_len}")

    cache_path = os.path.join(os.path.dirname(__file__), ".cache")
    os.makedirs(cache_path, exist_ok=True)
    out_path = os.path.join(cache_path, f"{out_name}_{limit}.pkl")
    with open(out_path, "wb") as f:
        pickle.dump(
            {
                "tokens": df["tokens"].tolist(),
                "smiles": df["smiles"].tolist(),
                "scaf": df["scaffolds"].tolist(),
                **context_dict,
            },
            f,
        )
    print(f"Saved to {out_path}")
    print("Done.")


class PretokDataset(torch.utils.data.Dataset):
    """Loads pretokenized examples from disk and returns them as PyTorch tensors."""

    def __init__(self, split, pad_token_id, dataset="processed_dataset.pkl"):
        super().__init__()
        self.split = split
        self.dataset = dataset
        self.pad_token_id = pad_token_id
        cache_path = os.path.join(os.path.dirname(__file__), ".cache")
        with open(os.path.join(cache_path, self.dataset), "rb") as f:
            self.data_dict = pickle.load(f)

        # split out 10% of the data for validation
        split_ix = int(len(self.data_dict["tokens"]) * 0.9)
        if self.split == "train":
            self.data_dict = {k: self.data_dict[k][:split_ix] for k in self.data_dict}
        elif self.split == "val":
            self.data_dict = {k: self.data_dict[k][split_ix:] for k in self.data_dict}
        else:
            raise RuntimeError(f"Could not find split for: self.split={self.split}")

    def __len__(self):
        return len(self.data_dict["tokens"])

    def __getitem__(self, idx):
        m = self.data_dict

        start = idx
        end = idx + 1

        # calling .astype will copy the data into a new numpy array, now in RAM
        padded_tokens = pad_batch(m["tokens"][start:end], self.pad_token_id)
        chunk = torch.from_numpy((padded_tokens).astype(np.int64))

        padded_scaffolds = torch.from_numpy(
            pad_batch(m["scaf"][start:end], self.pad_token_id).astype(np.int64)
        )

        item = {
            "seq": chunk,
            "scaf": padded_scaffolds,
            "smiles": m["smiles"][start:end],
            **{
                k: torch.tensor(m[k][start:end], dtype=torch.float32)
                for k in m
                if k != "scaf" and k != "tokens" and k != "smiles"
            },
        }

        return item


def padding_collate_fn(
    data, tokenizer: SmilesTokenizer, fragment_creator: BaseFragmentCreator
):
    # data = list of dicts
    pad_idx = tokenizer.pad_token_id

    src = [d["seq"] for d in data]

    max_len = max([len(d) for d in src])
    padded_src = np.ones([len(src), max_len]) * pad_idx
    for i, j in enumerate(src):
        padded_src[i][0 : len(j)] = j.ravel()

    if fragment_creator is None:
        smiles_context = [d["scaf"] for d in data]
    else:
        # remove start and end token after tokenization with [1:-1]
        smiles_context = []
        for d in data:
            s = d["smiles"][0]
            tokens = d["seq"]
            frag = fragment_creator.create_fragment(Fragment(smiles=s, tokens=tokens))
            if frag.tokens is not None:
                smiles_context.append(frag.tokens)
            else:
                smiles_context.append(
                    torch.tensor(
                        tokenizer.encode(frag.smiles)[1:-1],
                        dtype=torch.long,
                        device=tokens.device,
                    )
                )

    max_len_ctx = max([len(d) for d in smiles_context])
    padded_smiles_context = np.ones([len(smiles_context), max_len_ctx]) * pad_idx
    for i, j in enumerate(smiles_context):
        padded_smiles_context[i][0 : len(j)] = j.ravel()
    # try to predict the next token from the previous tokens,
    # essentially reconstructing the src sentence from the embeddings and the previous sentence
    padded_src = padded_src.T

    original_context_keys = [
        k for k in data[0].keys() if k != "seq" and k != "scaf" and k != "smiles"
    ]
    context_out_dict = {k: [] for k in original_context_keys}

    for k in original_context_keys:
        val_list = []
        for d in data:
            val_list.append(d[k])

        context_out_dict[k] = torch.concat(val_list, dim=0)

    return {
        "src": torch.tensor(padded_src, dtype=torch.long),  # (seq_len, batch_size)
        "fragment": torch.tensor(padded_smiles_context.T, dtype=torch.long),
        "context": context_out_dict,
    }


class SmilesTask:
    @staticmethod
    def iter_batches(
        split,
        batch_size,
        device,
        context_keys: List[str],
        num_workers=0,
        dataset="processed_dataset.pkl",
        fragment_creator: BaseFragmentCreator = BricksFragmentCreator(),
    ):
        tokenizer = SmilesTokenizer()
        ds = PretokDataset(split, tokenizer.pad_token_id, dataset=dataset)
        is_ddp = int(os.environ.get("RANK", -1)) != -1
        dl = torch.utils.data.DataLoader(
            ds,
            batch_size=batch_size,
            pin_memory=True,
            num_workers=num_workers,
            shuffle=False,
            sampler=DistributedSampler(ds) if is_ddp else None,
            collate_fn=lambda batch: padding_collate_fn(
                batch, tokenizer, fragment_creator
            ),
        )

        for data in dl:
            data["src"] = data["src"].to(device, non_blocking=True)
            data["tgt"] = data["src"].to(device, non_blocking=True)

            # shift by one token for next-token prediction
            data["src"] = data["src"][:-1, :].T  # batch_size, seq_len
            data["tgt"] = data["tgt"][1:, :].T  # batch_size, seq_len

            data["fragment"] = (
                data["fragment"].to(device, non_blocking=True).T
            )  # batch_size, seq_len
            keys = list(data["context"].keys())
            for d in keys:
                if d not in context_keys:
                    del data["context"][d]
                else:
                    data["context"][d] = data["context"][d].to(
                        device, non_blocking=True
                    )

            yield data


if __name__ == "__main__":
    pretokenize(
        data_file=os.path.join(
            DATA_CACHE_DIR,
            "OrganiX13.parquet",
        ),
        limit=None,  # set how many molecules should be processed; if None, all molecules are processed
        context=["logp", "sascore", "mol_weight"],
        out_name="processed_dataset",
        remove_nan_context_rows=False,
    )
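
A sketch of pulling one training batch with these utilities, assuming pretokenize() has already been run with the defaults above (with limit=None it writes .cache/processed_dataset_None.pkl; note the _{limit} suffix, which differs from the iter_batches default filename):

import torch
from preprocess_dataset import SmilesTask
from fragment_creator import BricksFragmentCreator

batches = SmilesTask.iter_batches(
    split="train",
    batch_size=4,
    device=torch.device("cpu"),
    context_keys=["logp", "sascore", "mol_weight"],
    dataset="processed_dataset_None.pkl",
    fragment_creator=BricksFragmentCreator(),
)
batch = next(batches)  # iter_batches is a generator
print(batch["src"].shape, batch["tgt"].shape, batch["fragment"].shape)
print({k: v.shape for k, v in batch["context"].items()})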
requirements.txt
ADDED
@@ -0,0 +1,8 @@
numpy==1.23.5
pytest==7.4.0
Requests==2.31.0
sentencepiece==0.1.99
tiktoken==0.3.3
torch==2.0.1
tqdm==4.64.1
wandb==0.15.5
sample.py
ADDED
@@ -0,0 +1,616 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from contextlib import nullcontext
|
3 |
+
import sys
|
4 |
+
import time
|
5 |
+
import pandas as pd
|
6 |
+
import torch
|
7 |
+
from tqdm.auto import tqdm
|
8 |
+
|
9 |
+
# from tqdm.notebook import tqdm
|
10 |
+
from model import Transformer
|
11 |
+
from plot_utils import (
|
12 |
+
check_metrics,
|
13 |
+
plot_1D_condition,
|
14 |
+
plot_2D_condition,
|
15 |
+
plot_3D_condition,
|
16 |
+
plot_unconditional,
|
17 |
+
)
|
18 |
+
from tokenizer import SmilesTokenizer
|
19 |
+
import numpy as np
|
20 |
+
from typing import Dict, List, Tuple, Union
|
21 |
+
import re
|
22 |
+
|
23 |
+
from rdkit import Chem
|
24 |
+
from rdkit import DataStructs
|
25 |
+
from rdkit.Chem.Fingerprints import FingerprintMols
|
26 |
+
|
27 |
+
import logging
|
28 |
+
|
29 |
+
logger = logging.getLogger(__name__)
|
30 |
+
|
31 |
+
|
32 |
+
class Sampler:
|
33 |
+
def __init__(
|
34 |
+
self,
|
35 |
+
load_path: str,
|
36 |
+
device: str = "cpu",
|
37 |
+
seed: int = 1337,
|
38 |
+
dtype: str = "float16",
|
39 |
+
compile: bool = True,
|
40 |
+
quantize: bool = False,
|
41 |
+
) -> None:
|
42 |
+
self.load_path = load_path
|
43 |
+
self.device = device
|
44 |
+
self.dtype = dtype
|
45 |
+
self.compile = compile
|
46 |
+
self.quantize = quantize
|
47 |
+
self.seed = seed
|
48 |
+
self._init_model()
|
49 |
+
|
50 |
+
def _init_model(self):
|
51 |
+
np.random.seed(self.seed)
|
52 |
+
torch.cuda.manual_seed(self.seed)
|
53 |
+
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
|
54 |
+
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
|
55 |
+
self.device_type = (
|
56 |
+
"cuda" if "cuda" in self.device else "cpu"
|
57 |
+
) # for later use in torch.autocast
|
58 |
+
ptdtype = {
|
59 |
+
"float32": torch.float32,
|
60 |
+
"bfloat16": torch.bfloat16,
|
61 |
+
"float16": torch.float16,
|
62 |
+
}[self.dtype]
|
63 |
+
self.ptdtype = ptdtype
|
64 |
+
|
65 |
+
self.ctx = self._autocast()
|
66 |
+
# init from a model saved in a specific directory
|
67 |
+
# ckpt_path = os.path.join(out_dir, "ckpt_full_dim=256.pt")
|
68 |
+
self.model = Transformer.load(self.load_path, device=self.device)
|
69 |
+
|
70 |
+
self.model.eval()
|
71 |
+
if self.quantize:
|
72 |
+
raise NotImplementedError("Not properly implemented for CPU / GPU")
|
73 |
+
self.model = torch.ao.quantization.quantize_dynamic(
|
74 |
+
self.model, # the original model
|
75 |
+
{torch.nn.Linear}, # a set of layers to dynamically quantize
|
76 |
+
dtype=torch.qint8,
|
77 |
+
)
|
78 |
+
|
79 |
+
if self.compile:
|
80 |
+
logger.info("Compiling the model...")
|
81 |
+
self.model = torch.compile(self.model) # requires PyTorch 2.0 (optional)
|
82 |
+
|
83 |
+
self.model = self.model.to(self.device)
|
84 |
+
# load the tokenizer
|
85 |
+
self.tokenizer = SmilesTokenizer()
|
86 |
+
|
87 |
+
def get_context(
|
88 |
+
self,
|
89 |
+
context_col: List[str],
|
90 |
+
context_smi: str,
|
91 |
+
num_examples: int = 50,
|
92 |
+
):
|
93 |
+
"""
|
94 |
+
Returns a dictionary in the form of
|
95 |
+
{
|
96 |
+
"fragment": torch.tensor,
|
97 |
+
"context": {
|
98 |
+
"logp": torch.tensor,
|
99 |
+
"sascore": torch.tensor,
|
100 |
+
"mol_weight": torch.tensor
|
101 |
+
}
|
102 |
+
}
|
103 |
+
|
104 |
+
|
105 |
+
When context_smi is set to a string, then the "fragment" field is populated.
|
106 |
+
Each property listed in context_col becomes a key, and its values are sampled from a reasonable range for that property.
|
107 |
+
|
108 |
+
num_examples indicates how many values are sampled for each property.
|
109 |
+
"""
|
110 |
+
output_dict = {"context": {}, "fragment": None}
|
111 |
+
|
112 |
+
if context_smi is not None:
|
113 |
+
logger.debug(
|
114 |
+
f"context_smiles: {context_smi}",
|
115 |
+
)
|
116 |
+
# NOTE: Remove beginning [CLS] and end token [SEP]
|
117 |
+
incorporate_selfie = self.tokenizer.encode(context_smi)[1:-1]
|
118 |
+
|
119 |
+
context = torch.tensor(
|
120 |
+
[incorporate_selfie] * num_examples,
|
121 |
+
dtype=torch.long,
|
122 |
+
device=self.device,
|
123 |
+
)
|
124 |
+
|
125 |
+
output_dict["fragment"] = context
|
126 |
+
|
127 |
+
if context_col is None:
|
128 |
+
return output_dict
|
129 |
+
|
130 |
+
if "logp" in context_col:
|
131 |
+
# context = 0.5 * torch.randint(
|
132 |
+
# -8, 14, (num_examples,), device=self.device, dtype=torch.float
|
133 |
+
# )
|
134 |
+
# context = 0.5 * torch.randint(
|
135 |
+
# -6, 6, (num_examples, 1), device=device, dtype=torch.float
|
136 |
+
# )
|
137 |
+
context = torch.tensor(
|
138 |
+
np.random.choice([-2, 0, 2], (num_examples,)),
|
139 |
+
device=self.device,
|
140 |
+
dtype=self.ptdtype,
|
141 |
+
)
|
142 |
+
# context = 2.0 * torch.ones(
|
143 |
+
# (num_examples,1), device=device, dtype=torch.float
|
144 |
+
# )
|
145 |
+
# context = -2.0*torch.ones((num_examples,2),device=device,dtype=torch.float)
|
146 |
+
# context, _ = torch.sort(context, 0)
|
147 |
+
output_dict["context"]["logp"] = context
|
148 |
+
|
149 |
+
if "energy" in context_col:
|
150 |
+
context = 0.1 * torch.randint(
|
151 |
+
-15, 15, (num_examples,), device=self.device, dtype=torch.float
|
152 |
+
)
|
153 |
+
# context = -2.0*torch.ones((num_examples,2),device=device,dtype=torch.float)
|
154 |
+
context, _ = torch.sort(context, 0)
|
155 |
+
output_dict["context"]["energy"] = context
|
156 |
+
|
157 |
+
if "sascore" in context_col:
|
158 |
+
# context = 0.5 * torch.randint(
|
159 |
+
# 2, 20, (num_examples, ), device=self.device, dtype=torch.float
|
160 |
+
# )
|
161 |
+
context = torch.tensor(
|
162 |
+
np.random.choice([2, 3, 4], (num_examples,)),
|
163 |
+
device=self.device,
|
164 |
+
dtype=torch.float,
|
165 |
+
)
|
166 |
+
# context = 0.5 * torch.randint(
|
167 |
+
# 4, 8, (num_examples, 1), device=device, dtype=torch.float
|
168 |
+
# )
|
169 |
+
# context = 2.0*torch.ones((num_examples,1),device=device,dtype=torch.float)
|
170 |
+
# context, _ = torch.sort(context, 0)
|
171 |
+
output_dict["context"]["sascore"] = context
|
172 |
+
|
173 |
+
if "mol_weight" in context_col:
|
174 |
+
# context = 0.5 * torch.randint(
|
175 |
+
# 2, 20, (num_examples,), device=self.device, dtype=torch.float
|
176 |
+
# )
|
177 |
+
context = torch.tensor(
|
178 |
+
np.random.choice([2.0, 3.0, 4.0], (num_examples,)),
|
179 |
+
device=self.device,
|
180 |
+
dtype=torch.float,
|
181 |
+
)
|
182 |
+
|
183 |
+
# context = 0.5 * torch.randint(
|
184 |
+
# 2, 20, (num_examples, 1), device=device, dtype=torch.float
|
185 |
+
# )
|
186 |
+
# context = 2.5*torch.ones((num_examples,1),device=device,dtype=torch.float)
|
187 |
+
# context, _ = torch.sort(context, 0)
|
188 |
+
output_dict["context"]["mol_weight"] = context
|
189 |
+
|
190 |
+
return output_dict
|
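# Example (illustrative, based on the sampling branches above):
#   get_context(["logp", "sascore"], None, num_examples=4)
# returns {"fragment": None, "context": {"logp": tensor of 4 values drawn from
# {-2, 0, 2}, "sascore": tensor of 4 values drawn from {2, 3, 4}}}.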
191 |
+
|
192 |
+
def _autocast(self):
|
193 |
+
if "cuda" in self.device:
|
194 |
+
if self.dtype == "bfloat16" and torch.cuda.is_bf16_supported():
|
195 |
+
return torch.cuda.amp.autocast(dtype=torch.bfloat16)
|
196 |
+
elif self.dtype == "float16":
|
197 |
+
return torch.cuda.amp.autocast(dtype=torch.float16)
|
198 |
+
else:
|
199 |
+
return torch.cuda.amp.autocast(dtype=torch.float32)
|
200 |
+
else: # cpu
|
201 |
+
return nullcontext()
|
202 |
+
|
203 |
+
@torch.no_grad()
|
204 |
+
def generate(
|
205 |
+
self,
|
206 |
+
context_cols: Union[List[str], None, Dict[str, torch.Tensor]] = None,
|
207 |
+
context_smi: Union[str, None] = None,
|
208 |
+
start_smiles: Union[str, None] = None,
|
209 |
+
num_samples: int = 50,
|
210 |
+
max_new_tokens: int = 256,
|
211 |
+
temperature: float = 1.0,
|
212 |
+
top_k: Union[int, None] = None,
|
213 |
+
return_context: bool = False,
|
214 |
+
total_gen_steps: int = 1,
|
215 |
+
use_kv_cache: bool = False,
|
216 |
+
) -> Union[List[str], Tuple[List[str], List[float]]]:
|
217 |
+
"""
|
218 |
+
Generates a list of SMILES. With the default options it would generate them unconditionally.
|
219 |
+
Params:
|
220 |
+
- context_cols : When given a list, the context is randomly sampled via the get_context method; when given a dictionary, the
|
221 |
+
context values are taken from the dictionary instead.
|
222 |
+
- context_smi : Further conditioning through the use of a molecular fragment.
|
223 |
+
- start_smiles : Can be used to start each SMILES with a specific string; the model then generates the following tokens, keeping that start sequence.
|
224 |
+
- num_samples : Controls how many SMILES in total will be generated by the model.
|
225 |
+
- max_new_tokens : Controls the maximum length (in tokens) of each generated SMILES.
|
226 |
+
- temperature : Controls the randomness of the model. A temperature of 1.0 samples the trained distribution; temperature < 1 is more deterministic and temperature > 1 is more random.
|
227 |
+
- top_k : Clamps the probability distribution to the top k tokens, from which the next token is then sampled.
|
228 |
+
- return_context : Whether the context that was given to the model should be returned.
|
229 |
+
- total_gen_steps : How many sub-steps the generation should be split into. Useful when generating 10k+ SMILES and wanting to chunk them into, for example, 10 * 1k generations with total_gen_steps = 10.
|
230 |
+
- use_kv_cache: Runs the generation using kv-caching. It is faster, but takes more memory.
|
231 |
+
"""
|
232 |
+
|
233 |
+
with self.ctx:
|
234 |
+
gens_per_step = num_samples // total_gen_steps
|
235 |
+
|
236 |
+
logger.debug(f"Gens per Step: {gens_per_step}")
|
237 |
+
context = None # {"context": None, "fragment" : None}
|
238 |
+
out_smiles = []
|
239 |
+
with tqdm(total=total_gen_steps, desc="Batch") as pbar:
|
240 |
+
for i in range(total_gen_steps):
|
241 |
+
if isinstance(context_cols, dict):
|
242 |
+
# TODO: Test if same length
|
243 |
+
cd = {
|
244 |
+
c: context_cols[c][
|
245 |
+
i * gens_per_step : (i + 1) * gens_per_step
|
246 |
+
]
|
247 |
+
for c in context_cols.keys()
|
248 |
+
}
|
249 |
+
|
250 |
+
context_dict = {"context": cd, "fragment": None}
|
251 |
+
if context_smi is not None:
|
252 |
+
logger.debug(
|
253 |
+
f"context_smiles: {context_smi}",
|
254 |
+
)
|
255 |
+
# NOTE: Remove beginning [CLS] and end token [SEP]
|
256 |
+
incorporate_selfie = self.tokenizer.encode(context_smi)[
|
257 |
+
1:-1
|
258 |
+
]
|
259 |
+
|
260 |
+
context_tensor = torch.tensor(
|
261 |
+
[incorporate_selfie] * gens_per_step,
|
262 |
+
dtype=torch.long,
|
263 |
+
device=self.device,
|
264 |
+
)
|
265 |
+
|
266 |
+
context_dict["fragment"] = context_tensor
|
267 |
+
context_cols = list(context_cols.keys())
|
268 |
+
|
269 |
+
else:
|
270 |
+
context_dict = self.get_context(
|
271 |
+
context_cols, context_smi, num_examples=gens_per_step
|
272 |
+
)
|
273 |
+
|
274 |
+
# for k in range(num_samples):
|
275 |
+
y = self.model.generate(
|
276 |
+
self.tokenizer,
|
277 |
+
context=context_dict["context"],
|
278 |
+
fragments=context_dict["fragment"],
|
279 |
+
start_smiles=start_smiles,
|
280 |
+
num_gen=gens_per_step,
|
281 |
+
temperature=temperature,
|
282 |
+
top_k=top_k,
|
283 |
+
max_length=max_new_tokens,
|
284 |
+
device=self.device,
|
285 |
+
cache_kv=use_kv_cache,
|
286 |
+
)
|
287 |
+
|
288 |
+
new_context = {k: [] for k in context_dict["context"]}
|
289 |
+
for j, sample in enumerate(y):  # j: index within this step (the outer loop already uses i)
|
290 |
+
# print(sample)
|
291 |
+
mol = Chem.MolFromSmiles(sample)
|
292 |
+
if mol is not None:
|
293 |
+
out_smiles.append(sample)
|
294 |
+
for k in new_context:
|
295 |
+
new_context[k].append(
|
296 |
+
context_dict["context"][k][i].unsqueeze(-1)
|
297 |
+
)
|
298 |
+
|
299 |
+
for k in new_context:
|
300 |
+
new_context[k] = torch.concat(new_context[k], dim=0)
|
301 |
+
|
302 |
+
if context is None:
|
303 |
+
context = new_context
|
304 |
+
else:
|
305 |
+
for k in context:
|
306 |
+
context[k] = torch.concat(
|
307 |
+
[context[k], new_context[k]], dim=0
|
308 |
+
)
|
309 |
+
|
310 |
+
pbar.update(1)
|
311 |
+
|
312 |
+
logger.info(
|
313 |
+
f"Number valid generated: {len(out_smiles) / num_samples * 100} %"
|
314 |
+
)
|
315 |
+
logger.info("---------------")
|
316 |
+
|
317 |
+
if return_context:
|
318 |
+
return (out_smiles, context)
|
319 |
+
|
320 |
+
else:
|
321 |
+
return out_smiles
|
322 |
+
|
323 |
+
@torch.no_grad()
|
324 |
+
def generate_with_evaluation(
|
325 |
+
self,
|
326 |
+
context_cols: Union[List[str], None] = None,
|
327 |
+
context_smi: Union[str, None] = None,
|
328 |
+
start_smiles: Union[str, None] = None,
|
329 |
+
num_samples: int = 50,
|
330 |
+
max_new_tokens: int = 256,
|
331 |
+
temperature: float = 1.0,
|
332 |
+
top_k: Union[int, None] = None,
|
333 |
+
cmp_context_dict: Union[Dict[str, torch.Tensor], None] = None,
|
334 |
+
total_gen_steps: int = 1,
|
335 |
+
use_kv_cache: bool = False,
|
336 |
+
):
|
337 |
+
out_smiles, new_context = self.generate(
|
338 |
+
context_cols=context_cols,
|
339 |
+
context_smi=context_smi,
|
340 |
+
start_smiles=start_smiles,
|
341 |
+
num_samples=num_samples,
|
342 |
+
max_new_tokens=max_new_tokens,
|
343 |
+
temperature=temperature,
|
344 |
+
top_k=top_k,
|
345 |
+
return_context=True,
|
346 |
+
total_gen_steps=total_gen_steps,
|
347 |
+
use_kv_cache=use_kv_cache,
|
348 |
+
)
|
349 |
+
|
350 |
+
out_dir = os.path.dirname(self.load_path)
|
351 |
+
|
352 |
+
if context_cols is not None:
|
353 |
+
if len(context_cols) == 1:
|
354 |
+
plot_1D_condition(
|
355 |
+
context_cols,
|
356 |
+
os.path.join(out_dir, "plots"),
|
357 |
+
new_context,
|
358 |
+
out_smiles,
|
359 |
+
temperature,
|
360 |
+
cmp_context_dict,
|
361 |
+
context_scaler=None,
|
362 |
+
)
|
363 |
+
|
364 |
+
elif len(context_cols) == 2:
|
365 |
+
plot_2D_condition(
|
366 |
+
context_cols,
|
367 |
+
os.path.join(out_dir, "plots"),
|
368 |
+
new_context,
|
369 |
+
out_smiles,
|
370 |
+
temperature,
|
371 |
+
label=context_smi,
|
372 |
+
)
|
373 |
+
|
374 |
+
elif len(context_cols) == 3:
|
375 |
+
plot_3D_condition(
|
376 |
+
context_cols,
|
377 |
+
os.path.join(out_dir, "plots"),
|
378 |
+
new_context,
|
379 |
+
out_smiles,
|
380 |
+
temperature,
|
381 |
+
)
|
382 |
+
|
383 |
+
else:
|
384 |
+
raise NotImplementedError(
|
385 |
+
"Currently not implemented for len(context_col) > 3"
|
386 |
+
)
|
387 |
+
|
388 |
+
else:
|
389 |
+
# Unconditional Case
|
390 |
+
plot_unconditional(
|
391 |
+
out_path=os.path.join(out_dir, "plots"),
|
392 |
+
smiles=out_smiles,
|
393 |
+
temperature=temperature,
|
394 |
+
cmp_context_dict=cmp_context_dict,
|
395 |
+
)
|
396 |
+
|
397 |
+
if context_smi is not None:
|
398 |
+
pattern = r"\[\d+\*\]"
|
399 |
+
# replace [14*] etc
|
400 |
+
context_smi = re.sub(pattern, "", context_smi)
|
401 |
+
|
402 |
+
context_mol = Chem.MolFromSmiles(context_smi)
|
403 |
+
context_smarts = Chem.MolToSmarts(context_mol)
|
404 |
+
|
405 |
+
pattern = r"(?<!\[)([:-=#])(?!\])(?![^\[]*?\])"
|
406 |
+
|
407 |
+
context_smarts = re.sub(pattern, "~", context_smarts)
|
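# Illustrative effect of the substitution above: explicit bond symbols
# (-, =, :, #) outside atom brackets are relaxed to "~" (any bond), e.g.
# "[#6]=[#8]" becomes "[#6]~[#8]", so the substructure check below tolerates
# differing bond orders in the generated molecules.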
408 |
+
logger.info(f"context_smarts {context_smarts}")
|
409 |
+
out_mols = [Chem.MolFromSmiles(smi) for smi in out_smiles]
|
410 |
+
|
411 |
+
context_fingerprint = FingerprintMols.FingerprintMol(context_mol)
|
412 |
+
out_fingerprints = [FingerprintMols.FingerprintMol(fi) for fi in out_mols]
|
413 |
+
all_sim = []
|
414 |
+
all_sub = []
|
415 |
+
for out_fing, out_mol in zip(out_fingerprints, out_mols):
|
416 |
+
similarity = DataStructs.TanimotoSimilarity(
|
417 |
+
context_fingerprint, out_fing
|
418 |
+
)
|
419 |
+
|
420 |
+
has_sub = out_mol.HasSubstructMatch(Chem.MolFromSmarts(context_smarts))
|
421 |
+
all_sub.append(has_sub)
|
422 |
+
all_sim.append(similarity)
|
423 |
+
|
424 |
+
# print(similarity,has_sub)
|
425 |
+
logger.info(f"Mean sim {np.mean(all_sim)}")
|
426 |
+
logger.info(
|
427 |
+
f"Has Sub: {np.count_nonzero(all_sub)} or {round(np.count_nonzero(all_sub) / len(all_sub) * 100, 4)} %"
|
428 |
+
)
|
429 |
+
|
430 |
+
return out_smiles, new_context
|
431 |
+
|
432 |
+
|
433 |
+
if __name__ == "__main__":
|
434 |
+
import argparse
|
435 |
+
import rdkit.rdBase as rkrb
|
436 |
+
import rdkit.RDLogger as rkl
|
437 |
+
|
438 |
+
logger = rkl.logger()
|
439 |
+
logger.setLevel(rkl.ERROR)
|
440 |
+
rkrb.DisableLog("rdApp.error")
|
441 |
+
|
442 |
+
torch.set_num_threads(8)
|
443 |
+
logging.basicConfig(level=logging.INFO)
|
444 |
+
logger = logging.getLogger(__name__)
|
445 |
+
|
446 |
+
parser = argparse.ArgumentParser(
|
447 |
+
description="Generate SMILES strings using a trained model."
|
448 |
+
)
|
449 |
+
# parser.add_argument('--context_cols', type=str, nargs='+', default=None)
|
450 |
+
parser.add_argument(
|
451 |
+
"--context_cols",
|
452 |
+
type=str,
|
453 |
+
nargs="+",
|
454 |
+
default=None,
|
455 |
+
help="The given conditions are sampled from a fixed interval and given to the modeĺ.",
|
456 |
+
)
|
457 |
+
parser.add_argument(
|
458 |
+
"--context_smi",
|
459 |
+
type=str,
|
460 |
+
default=None,
|
461 |
+
help="This SMILES is given as context to the model and should be integrated in the generated molecules.",
|
462 |
+
)
|
463 |
+
parser.add_argument(
|
464 |
+
"--start_smiles",
|
465 |
+
type=str,
|
466 |
+
default=None,
|
467 |
+
help="This SMILES is placed at the front of each sample, from which on the generation continues.",
|
468 |
+
)
|
469 |
+
parser.add_argument(
|
470 |
+
"--ckpt_path",
|
471 |
+
type=str,
|
472 |
+
default=os.path.join(os.path.dirname(__file__), "out", "llama2-M-Full-RSS.pt"),
|
473 |
+
help="Which model should be used in the generation",
|
474 |
+
)
|
475 |
+
parser.add_argument(
|
476 |
+
"--num_samples",
|
477 |
+
type=int,
|
478 |
+
default=50,
|
479 |
+
help="Controls how many samples should be generated",
|
480 |
+
)
|
481 |
+
parser.add_argument(
|
482 |
+
"--num_samples_per_step",
|
483 |
+
type=int,
|
484 |
+
default=1000,
|
485 |
+
help="Works in conjunction with num_samples, by splitting the total into num_samples_per_step jobs. When num_samples > num_samples_per_step then it is split up into multiple seperate generation steps.",
|
486 |
+
)
|
487 |
+
|
488 |
+
parser.add_argument(
|
489 |
+
"--max_new_tokens",
|
490 |
+
type=int,
|
491 |
+
default=256,
|
492 |
+
help="Sets how many tokens should be generated from the model. We only trained with a max size of 256, but it is possible to generate longer molecules. However, these might be worse in quality.",
|
493 |
+
)
|
494 |
+
parser.add_argument(
|
495 |
+
"--temperature",
|
496 |
+
type=float,
|
497 |
+
default=0.8,
|
498 |
+
help="Sets the randomness of the generation - A temperature of 0 would be deterministic and a temperature of > 1 is more random.",
|
499 |
+
)
|
500 |
+
parser.add_argument(
|
501 |
+
"--top_k",
|
502 |
+
type=int,
|
503 |
+
default=None,
|
504 |
+
help="The top_k of the sampling. Per default it is None, but can be set to an integer to have a more focused generation.",
|
505 |
+
)
|
506 |
+
parser.add_argument(
|
507 |
+
"--seed",
|
508 |
+
type=int,
|
509 |
+
default=1234,
|
510 |
+
help="Random number generator seed, to make sampling consistent.",
|
511 |
+
)
|
512 |
+
parser.add_argument(
|
513 |
+
"--cmp_dataset_path",
|
514 |
+
type=str,
|
515 |
+
default=None,
|
516 |
+
help="A dataset in parquet or csv format to be used in the sample plots and to compute the metrics such as the novelty.",
|
517 |
+
)
|
518 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
519 |
+
parser.add_argument(
|
520 |
+
"--device",
|
521 |
+
type=str,
|
522 |
+
default=device,
|
523 |
+
help="Change the device the model and generation is run on",
|
524 |
+
)
|
525 |
+
|
526 |
+
if "cuda" in device:
|
527 |
+
# dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16'
|
528 |
+
dtype = "float16" if torch.cuda.is_available() else "float32"
|
529 |
+
else:
|
530 |
+
dtype = "float32"
|
531 |
+
|
532 |
+
parser.add_argument(
|
533 |
+
"--dtype",
|
534 |
+
type=str,
|
535 |
+
default=dtype,
|
536 |
+
help="Change the datatype of the computation. Per default it is float32 on CPU and float16 on GPU",
|
537 |
+
)
|
538 |
+
parser.add_argument(
|
539 |
+
"--compile",
|
540 |
+
type=lambda x: str(x).lower() in ("true", "1", "yes"),  # plain type=bool would treat any non-empty string as True
|
541 |
+
default=True,
|
542 |
+
help="Use torch.compile to compile the model. Only works on torch>=2.0, but should make the inference faster.",
|
543 |
+
)
|
544 |
+
parser.add_argument(
|
545 |
+
"--quantize",
|
546 |
+
type=lambda x: str(x).lower() in ("true", "1", "yes"),  # plain type=bool would treat any non-empty string as True
|
547 |
+
default=False,
|
548 |
+
help="(CURRENTLY NOT WORKING) Enable quantization to in8.",
|
549 |
+
)
|
550 |
+
parser.add_argument(
|
551 |
+
"--kv_caching",
|
552 |
+
action="store_true",
|
553 |
+
default=False,
|
554 |
+
help="Makes the attention mechanism linear, because the old keys and values are cached. The drawback is higher memory consumption.",
|
555 |
+
)
|
556 |
+
args = parser.parse_args()
|
557 |
+
|
558 |
+
logger.info("Sampling with the following parameters:")
|
559 |
+
logger.info(f"Checkpoint: {args.ckpt_path}")
|
560 |
+
logger.info(f"Context columns: {args.context_cols}")
|
561 |
+
logger.info(f"Context SMILES: {args.context_smi}")
|
562 |
+
logger.info(f"Start SMILES: {args.start_smiles}")
|
563 |
+
logger.info(f"Number of samples: {args.num_samples}")
|
564 |
+
logger.info(f"Max new tokens: {args.max_new_tokens}")
|
565 |
+
logger.info(f"Temperature: {args.temperature}")
|
566 |
+
logger.info(f"Top k: {args.top_k}")
|
567 |
+
logger.info(f"Seed: {args.seed}")
|
568 |
+
logger.info(f"Device: {args.device}")
|
569 |
+
logger.info(f"Data type: {args.dtype}")
|
570 |
+
logger.info(f"Compile: {args.compile}")
|
571 |
+
logger.info(f"Comparison dataset path: {args.cmp_dataset_path}")
|
572 |
+
logger.info(f"Quantize: {args.quantize}")
|
573 |
+
logger.info(f"Key Value Caching Enabled: {args.kv_caching}")
|
574 |
+
|
575 |
+
sampler = Sampler(
|
576 |
+
load_path=os.path.join(os.path.dirname(__file__), args.ckpt_path),
|
577 |
+
device=args.device,
|
578 |
+
seed=args.seed,
|
579 |
+
dtype=args.dtype,
|
580 |
+
compile=args.compile,
|
581 |
+
quantize=args.quantize,
|
582 |
+
)
|
583 |
+
|
584 |
+
comp_context_dict = None
|
585 |
+
comp_smiles = None
|
586 |
+
if args.cmp_dataset_path is not None:
|
587 |
+
df_comp = (
    pd.read_csv(args.cmp_dataset_path)
    if args.cmp_dataset_path.endswith(".csv")
    else pd.read_parquet(args.cmp_dataset_path)
)
|
588 |
+
df_comp = df_comp.sample(n=min(len(df_comp), 2_500_000))
|
589 |
+
comp_context_dict = {
|
590 |
+
c: df_comp[c].to_numpy() for c in ["logp", "sascore", "mol_weight"]
|
591 |
+
}
|
592 |
+
comp_smiles = df_comp["smiles"]
|
593 |
+
|
594 |
+
measure_time = True
|
595 |
+
start_time = time.time()
|
596 |
+
smiles, context = sampler.generate_with_evaluation(
|
597 |
+
context_cols=args.context_cols,
|
598 |
+
context_smi=args.context_smi,
|
599 |
+
start_smiles=args.start_smiles,
|
600 |
+
num_samples=args.num_samples,
|
601 |
+
max_new_tokens=args.max_new_tokens,
|
602 |
+
temperature=args.temperature,
|
603 |
+
top_k=args.top_k,
|
604 |
+
cmp_context_dict=comp_context_dict,
|
605 |
+
total_gen_steps=int(np.ceil(args.num_samples / args.num_samples_per_step)),
|
606 |
+
use_kv_cache=args.kv_caching,
|
607 |
+
)
|
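# Illustrative numbers: num_samples=10000 with num_samples_per_step=1000 gives
# total_gen_steps = ceil(10000 / 1000) = 10 generation batches of 1000 samples each.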
608 |
+
end_time = time.time()
|
609 |
+
if measure_time:
|
610 |
+
logger.info(f"Generation took: {end_time - start_time} sec")
|
611 |
+
if comp_smiles is not None:
|
612 |
+
res_metrics = check_metrics(smiles, comp_smiles)
|
613 |
+
logger.info(f"Metrics: {res_metrics}")
|
614 |
+
logger.info("Generated Molecules:")
|
615 |
+
for s in smiles:
|
616 |
+
print(s)
|
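A minimal usage sketch of the Sampler class above, independent of the CLI entry point. The checkpoint path mirrors the default out/llama2-M-Full-RSS.pt; the fixed logP target of 2.0 is an assumption chosen purely for illustration:

import torch
from sample import Sampler

# Hypothetical quick start: 10 SMILES conditioned on a fixed logP of 2.0.
sampler = Sampler(
    load_path="out/llama2-M-Full-RSS.pt",
    device="cuda" if torch.cuda.is_available() else "cpu",
    dtype="float32",
    compile=False,
)
smiles = sampler.generate(
    context_cols={"logp": torch.full((10,), 2.0)},  # explicit per-sample targets (assumed values)
    num_samples=10,
    temperature=0.8,
)
print(smiles)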
tokenizer.py
ADDED
@@ -0,0 +1,404 @@
1 |
+
# Requirements - transformers, tokenizers
|
2 |
+
# Right now, the Smiles Tokenizer uses an existing vocab file from rxnfp that is fairly comprehensive and derived from the USPTO dataset.
|
3 |
+
# The vocab may be expanded in the near future
|
4 |
+
|
5 |
+
# Code taken from here: https://github.com/deepchem/deepchem/blob/2.4.0/deepchem/feat/smiles_tokenizer.py#L39-L282
|
6 |
+
import collections
|
7 |
+
import os
|
8 |
+
import re
|
9 |
+
import pkg_resources
|
10 |
+
from typing import List
|
11 |
+
from transformers import BertTokenizer
|
12 |
+
from logging import getLogger
|
13 |
+
|
14 |
+
logger = getLogger(__name__)
|
15 |
+
"""
|
16 |
+
SMI_REGEX_PATTERN: str
|
17 |
+
SMILES regex pattern for tokenization. Designed by Schwaller et al.
|
18 |
+
|
19 |
+
References
|
20 |
+
|
21 |
+
.. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
|
22 |
+
ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
|
23 |
+
1572-1583 DOI: 10.1021/acscentsci.9b00576
|
24 |
+
|
25 |
+
"""
|
26 |
+
|
27 |
+
SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
|
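# Example (illustrative): re.findall(SMI_REGEX_PATTERN, "CC(=O)Cl") yields
# ['C', 'C', '(', '=', 'O', ')', 'Cl'] -- multi-character tokens such as
# 'Cl'/'Br' and bracket atoms like '[nH]' are kept as single tokens.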
28 |
+
|
29 |
+
# add vocab_file dict
|
30 |
+
VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
|
31 |
+
|
32 |
+
|
33 |
+
def get_default_tokenizer():
|
34 |
+
default_vocab_path = pkg_resources.resource_filename(
|
35 |
+
"deepchem", "feat/tests/vocab.txt"
|
36 |
+
)
|
37 |
+
return SmilesTokenizer(default_vocab_path)
|
38 |
+
|
39 |
+
|
40 |
+
class SmilesTokenizer(BertTokenizer):
|
41 |
+
"""
|
42 |
+
Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
|
43 |
+
implementation found in Huggingface's transformers library. It runs a WordPiece tokenization
|
44 |
+
algorithm over SMILES strings using the tokenization SMILES regex developed by Schwaller et al.
|
45 |
+
|
46 |
+
Please see https://github.com/huggingface/transformers
|
47 |
+
and https://github.com/rxn4chemistry/rxnfp for more details.
|
48 |
+
|
49 |
+
Examples
|
50 |
+
--------
|
51 |
+
>>> from deepchem.feat.smiles_tokenizer import SmilesTokenizer
|
52 |
+
>>> current_dir = os.path.dirname(os.path.realpath(__file__))
|
53 |
+
>>> vocab_path = os.path.join(current_dir, 'tests/data', 'vocab.txt')
|
54 |
+
>>> tokenizer = SmilesTokenizer(vocab_path)
|
55 |
+
>>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
|
56 |
+
[12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]
|
57 |
+
|
58 |
+
|
59 |
+
References
|
60 |
+
----------
|
61 |
+
.. [1] Schwaller, Philippe; Probst, Daniel; Vaucher, Alain C.; Nair, Vishnu H; Kreutter, David;
|
62 |
+
Laino, Teodoro; et al. (2019): Mapping the Space of Chemical Reactions using Attention-Based Neural
|
63 |
+
Networks. ChemRxiv. Preprint. https://doi.org/10.26434/chemrxiv.9897365.v3
|
64 |
+
|
65 |
+
Notes
|
66 |
+
----
|
67 |
+
This class requires huggingface's transformers and tokenizers libraries to be installed.
|
68 |
+
"""
|
69 |
+
|
70 |
+
vocab_files_names = VOCAB_FILES_NAMES
|
71 |
+
|
72 |
+
def __init__(
|
73 |
+
self,
|
74 |
+
# unk_token="[UNK]",
|
75 |
+
# sep_token="[SEP]",
|
76 |
+
# pad_token="[PAD]",
|
77 |
+
# cls_token="[CLS]",
|
78 |
+
# mask_token="[MASK]",
|
79 |
+
**kwargs
|
80 |
+
):
|
81 |
+
"""Constructs a SmilesTokenizer.
|
82 |
+
|
83 |
+
Parameters
|
84 |
+
----------
|
85 |
+
**kwargs
|
86 |
+
Additional keyword arguments forwarded to the BertTokenizer constructor.
|
87 |
+
The vocabulary is always loaded from the data/vocab.txt file next to this module.
|
88 |
+
"""
|
89 |
+
|
90 |
+
vocab_file = os.path.join(os.path.dirname(__file__), "data", "vocab.txt")
|
91 |
+
|
92 |
+
super().__init__(vocab_file, **kwargs)
|
93 |
+
|
94 |
+
self.sos = "[SOS]"
|
95 |
+
self.eos = "[EOS]"
|
96 |
+
|
97 |
+
if not os.path.isfile(vocab_file):
|
98 |
+
raise ValueError("Can't find a vocab file at path '{}'.".format(vocab_file))
|
99 |
+
self.vocab = load_vocab(vocab_file)
|
100 |
+
self.highest_unused_index = max(
|
101 |
+
[i for i, v in enumerate(self.vocab.keys()) if v.startswith("[unused")]
|
102 |
+
)
|
103 |
+
self.ids_to_tokens = collections.OrderedDict(
|
104 |
+
[(ids, tok) for tok, ids in self.vocab.items()]
|
105 |
+
)
|
106 |
+
self.basic_tokenizer = BasicSmilesTokenizer()
|
107 |
+
|
108 |
+
@property
|
109 |
+
def vocab_size(self):
|
110 |
+
return len(self.vocab)
|
111 |
+
|
112 |
+
@property
|
113 |
+
def vocab_list(self):
|
114 |
+
return list(self.vocab.keys())
|
115 |
+
|
116 |
+
def _tokenize(self, text: str):
|
117 |
+
"""
|
118 |
+
Tokenize a string into a list of tokens.
|
119 |
+
|
120 |
+
Parameters
|
121 |
+
----------
|
122 |
+
text: str
|
123 |
+
Input string sequence to be tokenized.
|
124 |
+
"""
|
125 |
+
|
126 |
+
split_tokens = [token for token in self.basic_tokenizer.tokenize(text)]
|
127 |
+
return split_tokens
|
128 |
+
|
129 |
+
def _convert_token_to_id(self, token):
|
130 |
+
"""
|
131 |
+
Converts a token (str/unicode) in an id using the vocab.
|
132 |
+
|
133 |
+
Parameters
|
134 |
+
----------
|
135 |
+
token: str
|
136 |
+
String token from a larger sequence to be converted to a numerical id.
|
137 |
+
"""
|
138 |
+
|
139 |
+
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
140 |
+
|
141 |
+
def _convert_id_to_token(self, index):
|
142 |
+
"""
|
143 |
+
Converts an index (integer) in a token (string/unicode) using the vocab.
|
144 |
+
|
145 |
+
Parameters
|
146 |
+
----------
|
147 |
+
index: int
|
148 |
+
Integer index to be converted back to a string-based token as part of a larger sequence.
|
149 |
+
"""
|
150 |
+
|
151 |
+
return self.ids_to_tokens.get(index, self.unk_token)
|
152 |
+
|
153 |
+
def convert_tokens_to_string(self, tokens: List[str]):
|
154 |
+
"""Converts a sequence of tokens (string) in a single string.
|
155 |
+
|
156 |
+
Parameters
|
157 |
+
----------
|
158 |
+
tokens: List[str]
|
159 |
+
List of tokens for a given string sequence.
|
160 |
+
|
161 |
+
Returns
|
162 |
+
-------
|
163 |
+
out_string: str
|
164 |
+
Single string from combined tokens.
|
165 |
+
"""
|
166 |
+
|
167 |
+
out_string: str = " ".join(tokens).replace(" ##", "").strip()
|
168 |
+
return out_string
|
169 |
+
|
170 |
+
def add_special_tokens_ids_single_sequence(self, token_ids: List[int]):
|
171 |
+
"""
|
172 |
+
Adds special tokens to a sequence for sequence classification tasks.
|
173 |
+
A BERT sequence has the following format: [CLS] X [SEP]
|
174 |
+
|
175 |
+
Parameters
|
176 |
+
----------
|
177 |
+
|
178 |
+
token_ids: list[int]
|
179 |
+
list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
|
180 |
+
"""
|
181 |
+
|
182 |
+
return [self.cls_token_id] + token_ids + [self.sep_token_id]
|
183 |
+
|
184 |
+
def add_special_tokens_single_sequence(self, tokens: List[str]):
|
185 |
+
"""
|
186 |
+
Adds special tokens to a sequence for sequence classification tasks.
|
187 |
+
A BERT sequence has the following format: [CLS] X [SEP]
|
188 |
+
|
189 |
+
Parameters
|
190 |
+
----------
|
191 |
+
tokens: List[str]
|
192 |
+
List of tokens for a given string sequence.
|
193 |
+
|
194 |
+
"""
|
195 |
+
return [self.cls_token] + tokens + [self.sep_token]
|
196 |
+
|
197 |
+
def add_special_tokens_ids_sequence_pair(
|
198 |
+
self, token_ids_0: List[int], token_ids_1: List[int]
|
199 |
+
) -> List[int]:
|
200 |
+
"""
|
201 |
+
Adds special tokens to a sequence pair for sequence classification tasks.
|
202 |
+
A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
|
203 |
+
|
204 |
+
Parameters
|
205 |
+
----------
|
206 |
+
token_ids_0: List[int]
|
207 |
+
List of ids for the first string sequence in the sequence pair (A).
|
208 |
+
|
209 |
+
token_ids_1: List[int]
|
210 |
+
List of tokens for the second string sequence in the sequence pair (B).
|
211 |
+
"""
|
212 |
+
|
213 |
+
sep = [self.sep_token_id]
|
214 |
+
cls = [self.cls_token_id]
|
215 |
+
|
216 |
+
return cls + token_ids_0 + sep + token_ids_1 + sep
|
217 |
+
|
218 |
+
def add_padding_tokens(
|
219 |
+
self, token_ids: List[int], length: int, right: bool = True
|
220 |
+
) -> List[int]:
|
221 |
+
"""
|
222 |
+
Adds padding tokens to return a sequence of the requested length.
|
223 |
+
By default padding tokens are added to the right of the sequence.
|
224 |
+
|
225 |
+
Parameters
|
226 |
+
----------
|
227 |
+
token_ids: list[int]
|
228 |
+
list of tokenized input ids. Can be obtained using the encode or encode_plus methods.
|
229 |
+
|
230 |
+
length: int
    Total length that the sequence should be padded to.
|
231 |
+
|
232 |
+
right: bool (True by default)
    If True, padding is appended to the right of the sequence; otherwise it is prepended.
|
233 |
+
|
234 |
+
Returns
|
235 |
+
----------
|
236 |
+
token_ids: List[int]
|
237 |
+
The input token ids padded with pad_token_id up to the requested length.
|
241 |
+
|
242 |
+
"""
|
243 |
+
padding = [self.pad_token_id] * (length - len(token_ids))
|
244 |
+
|
245 |
+
if right:
|
246 |
+
return token_ids + padding
|
247 |
+
else:
|
248 |
+
return padding + token_ids
|
249 |
+
|
250 |
+
def save_vocabulary(
|
251 |
+
self, vocab_path: str
|
252 |
+
): # -> tuple[str]: doctest issue raised with this return type annotation
|
253 |
+
"""
|
254 |
+
Save the tokenizer vocabulary to a file.
|
255 |
+
|
256 |
+
Parameters
|
257 |
+
----------
|
258 |
+
vocab_path: str
|
259 |
+
The directory in which to save the SMILES character per line vocabulary file.
|
260 |
+
Default vocab file is found in deepchem/feat/tests/data/vocab.txt
|
261 |
+
|
262 |
+
Returns
|
263 |
+
----------
|
264 |
+
vocab_file: :obj:`Tuple(str)`:
|
265 |
+
Paths to the files saved.
|
266 |
+
Tuple containing the path to the saved SMILES character-per-line vocabulary file.
|
268 |
+
|
269 |
+
"""
|
270 |
+
index = 0
|
271 |
+
if os.path.isdir(vocab_path):
|
272 |
+
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
273 |
+
else:
|
274 |
+
vocab_file = vocab_path
|
275 |
+
with open(vocab_file, "w", encoding="utf-8") as writer:
|
276 |
+
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
277 |
+
if index != token_index:
|
278 |
+
logger.warning(
|
279 |
+
"Saving vocabulary to {}: vocabulary indices are not consecutive."
|
280 |
+
" Please check that the vocabulary is not corrupted!".format(
|
281 |
+
vocab_file
|
282 |
+
)
|
283 |
+
)
|
284 |
+
index = token_index
|
285 |
+
writer.write(token + "\n")
|
286 |
+
index += 1
|
287 |
+
return (vocab_file,)
|
288 |
+
|
289 |
+
|
290 |
+
class BasicSmilesTokenizer(object):
|
291 |
+
"""
|
292 |
+
|
293 |
+
Run basic SMILES tokenization using a regex pattern developed by Schwaller et al. This tokenizer is to be used
|
294 |
+
when a tokenizer that does not depend on HuggingFace's transformers library is needed.
|
295 |
+
|
296 |
+
Examples
|
297 |
+
--------
|
298 |
+
>>> from deepchem.feat.smiles_tokenizer import BasicSmilesTokenizer
|
299 |
+
>>> tokenizer = BasicSmilesTokenizer()
|
300 |
+
>>> print(tokenizer.tokenize("CC(=O)OC1=CC=CC=C1C(=O)O"))
|
301 |
+
['C', 'C', '(', '=', 'O', ')', 'O', 'C', '1', '=', 'C', 'C', '=', 'C', 'C', '=', 'C', '1', 'C', '(', '=', 'O', ')', 'O']
|
302 |
+
|
303 |
+
|
304 |
+
References
|
305 |
+
----------
|
306 |
+
.. [1] Philippe Schwaller, Teodoro Laino, Théophile Gaudin, Peter Bolgar, Christopher A. Hunter, Costas Bekas, and Alpha A. Lee
|
307 |
+
ACS Central Science 2019 5 (9): Molecular Transformer: A Model for Uncertainty-Calibrated Chemical Reaction Prediction
|
308 |
+
1572-1583 DOI: 10.1021/acscentsci.9b00576
|
309 |
+
|
310 |
+
"""
|
311 |
+
|
312 |
+
def __init__(self, regex_pattern: str = SMI_REGEX_PATTERN):
|
313 |
+
"""Constructs a BasicSMILESTokenizer.
|
314 |
+
Parameters
|
315 |
+
----------
|
316 |
+
|
317 |
+
regex_pattern: string
|
318 |
+
SMILES token regex
|
319 |
+
|
320 |
+
"""
|
321 |
+
self.regex_pattern = regex_pattern
|
322 |
+
self.regex = re.compile(self.regex_pattern)
|
323 |
+
|
324 |
+
def tokenize(self, text):
|
325 |
+
"""Basic Tokenization of a SMILES."""
|
326 |
+
tokens = [token for token in self.regex.findall(text)]
|
327 |
+
return tokens
|
328 |
+
|
329 |
+
|
330 |
+
def load_vocab(vocab_file):
|
331 |
+
"""Loads a vocabulary file into a dictionary."""
|
332 |
+
vocab = collections.OrderedDict()
|
333 |
+
with open(vocab_file, "r", encoding="utf-8") as reader:
|
334 |
+
tokens = reader.readlines()
|
335 |
+
for index, token in enumerate(tokens):
|
336 |
+
token = token.rstrip("\n")
|
337 |
+
vocab[token] = index
|
338 |
+
return vocab
|
390 |
+
|
391 |
+
|
392 |
+
if __name__ == "__main__":
|
393 |
+
current_dir = os.path.dirname(os.path.realpath(__file__))
|
394 |
+
vocab_path = os.path.join(current_dir, "tests/data", "vocab.txt")
|
395 |
+
tokenizer = SmilesTokenizer()
|
396 |
+
|
397 |
+
tokens = tokenizer.encode(
|
398 |
+
"CN1CC[C@]23[C@@H]4[C@H]1CC5=C2C(=C(C=C5)O)O[C@H]3[C@H](C=C4)O"
|
399 |
+
)
|
400 |
+
print([tokenizer._convert_id_to_token(t) for t in tokens])
|
401 |
+
|
402 |
+
enc = tokenizer.encode("CC=O")
|
403 |
+
print(enc)
|
404 |
+
print(tokenizer.decode(enc, skip_special_tokens=True).replace(" ", ""))
|
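A small usage sketch (assumed, not part of the original file) showing how the padding helper above can batch encoded SMILES of different lengths:

from tokenizer import SmilesTokenizer

tok = SmilesTokenizer()
ids = [tok.encode(s) for s in ["CC=O", "c1ccccc1"]]            # ragged lists of token ids
max_len = max(len(seq) for seq in ids)
batch = [tok.add_padding_tokens(seq, max_len) for seq in ids]  # right-padded to max_len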
torch2-env.yaml
ADDED
@@ -0,0 +1,29 @@
1 |
+
name: torch2-llamol
|
2 |
+
channels:
|
3 |
+
- pytorch
|
4 |
+
- nvidia
|
5 |
+
- conda-forge
|
6 |
+
- defaults
|
7 |
+
dependencies:
|
8 |
+
- python=3.8
|
9 |
+
- torchaudio
|
10 |
+
- pytorch
|
11 |
+
- torchvision
|
12 |
+
- pytorch-cuda
|
13 |
+
- rdkit
|
14 |
+
- ca-certificates
|
15 |
+
- certifi
|
16 |
+
- openssl
|
17 |
+
- openbabel
|
18 |
+
- ipykernel
|
19 |
+
- pip:
|
20 |
+
- tqdm
|
21 |
+
- transformers
|
22 |
+
- pandas
|
23 |
+
- matplotlib
|
24 |
+
- seaborn
|
25 |
+
- hydra-core
|
26 |
+
- swifter
|
27 |
+
- pyarrow
|
28 |
+
- ipywidgets
|
29 |
+
- dask
|
train.py
ADDED
@@ -0,0 +1,101 @@
1 |
+
from trainer import (
|
2 |
+
IOConfig,
|
3 |
+
LoaderConfig,
|
4 |
+
Trainer,
|
5 |
+
TrainerArgs,
|
6 |
+
ModelArgs,
|
7 |
+
ContextArgs,
|
8 |
+
OptimizerConfig,
|
9 |
+
)
|
10 |
+
from torch.distributed.elastic.multiprocessing.errors import record
|
11 |
+
|
12 |
+
import hydra
|
13 |
+
from omegaconf import DictConfig, OmegaConf
|
14 |
+
import logging
|
15 |
+
import sys
|
16 |
+
import os
|
17 |
+
import torch
|
18 |
+
|
19 |
+
|
20 |
+
def setup_logger(run_name: str, log_path: str):
|
21 |
+
ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run?
|
22 |
+
if ddp:
|
23 |
+
ddp_rank = int(os.environ["RANK"])
|
24 |
+
ddp_local_rank = int(os.environ["LOCAL_RANK"])
|
25 |
+
ddp_world_size = int(os.environ["WORLD_SIZE"])
|
26 |
+
|
27 |
+
formatter = logging.Formatter(
|
28 |
+
f"[%(levelname)s] DDP[{ddp_rank},{ddp_local_rank},{ddp_world_size}] %(asctime)s - [%(filename)s:%(lineno)d]: %(message)s",
|
29 |
+
datefmt="%Y-%m-%d %H:%M:%S",
|
30 |
+
)
|
31 |
+
else:
|
32 |
+
formatter = logging.Formatter(
|
33 |
+
r"[%(levelname)s] %(asctime)s - [%(filename)s:%(lineno)d]: %(message)s",
|
34 |
+
datefmt="%Y-%m-%d %H:%M:%S",
|
35 |
+
)
|
36 |
+
|
37 |
+
stream_handler = logging.StreamHandler(sys.stdout)
|
38 |
+
stream_handler.setFormatter(formatter)
|
39 |
+
|
40 |
+
os.makedirs(log_path, exist_ok=True)
|
41 |
+
file_handler = logging.FileHandler(os.path.join(log_path, f"train_{run_name}.log"))
|
42 |
+
file_handler.setFormatter(formatter)
|
43 |
+
|
44 |
+
logging.basicConfig(level=logging.INFO, handlers=[stream_handler, file_handler])
|
45 |
+
|
46 |
+
return logging.getLogger()
|
47 |
+
|
48 |
+
|
49 |
+
@record
|
50 |
+
@hydra.main(version_base=None, config_path="config", config_name="config")
|
51 |
+
def main(cfg: DictConfig) -> None:
|
52 |
+
logger = setup_logger(
|
53 |
+
cfg.get("run_name", "default"), cfg.get("io", {"out_dir": "out"})["out_dir"]
|
54 |
+
)
|
55 |
+
|
56 |
+
logger.info("Using config")
|
57 |
+
logger.info(cfg)
|
58 |
+
|
59 |
+
cfg = cfg["train"]
|
60 |
+
io_conf = IOConfig(**cfg.get("io", {}))
|
61 |
+
loader_conf = LoaderConfig(**cfg.get("loader", {}))
|
62 |
+
model_args = ModelArgs(**cfg.get("model", {}))
|
63 |
+
ctx_args = ContextArgs(**cfg.get("context", {}))
|
64 |
+
optimizer_conf = OptimizerConfig(**cfg.get("optimizer", {}))
|
65 |
+
train_args = TrainerArgs(
|
66 |
+
io_conf=io_conf,
|
67 |
+
loader_conf=loader_conf,
|
68 |
+
model_conf=model_args,
|
69 |
+
context_conf=ctx_args,
|
70 |
+
optimizer_conf=optimizer_conf,
|
71 |
+
run_name=cfg.get("label", "train_run"),
|
72 |
+
)
|
73 |
+
|
74 |
+
# When training on CPU / for testing, avoid maxing out all CPU cores
|
75 |
+
torch.set_num_threads(8)
|
76 |
+
|
77 |
+
trainer = Trainer(
|
78 |
+
train_args=train_args,
|
79 |
+
dtype=cfg.get("dtype", "float16"),
|
80 |
+
compile=cfg.get("compile", False),
|
81 |
+
)
|
82 |
+
should_profile = cfg.get("profile", False)
|
83 |
+
|
84 |
+
if should_profile:
|
85 |
+
with torch.profiler.profile(
|
86 |
+
activities=[
|
87 |
+
torch.profiler.ProfilerActivity.CPU,
|
88 |
+
torch.profiler.ProfilerActivity.CUDA,
|
89 |
+
]
|
90 |
+
) as p:
|
91 |
+
trainer.train()
|
92 |
+
|
93 |
+
print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1))
|
94 |
+
|
95 |
+
else:
|
96 |
+
trainer.train()
|
97 |
+
|
98 |
+
|
99 |
+
if __name__ == "__main__":
|
100 |
+
# python train.py train=llama2-M-Full train.model.dim=1024
|
101 |
+
main()
|
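For completeness, a minimal sketch of driving the Trainer directly without hydra. Every value shown is an assumption mirroring the dataclass defaults in trainer.py, and it presumes ModelArgs and ContextArgs are constructible with their defaults:

from trainer import (
    ContextArgs,
    IOConfig,
    LoaderConfig,
    ModelArgs,
    OptimizerConfig,
    Trainer,
    TrainerArgs,
)

args = TrainerArgs(
    io_conf=IOConfig(out_dir="out"),
    loader_conf=LoaderConfig(batch_size=32, max_seq_len=256),
    model_conf=ModelArgs(),      # assumed default-constructible; vocab_size is set by the Trainer
    context_conf=ContextArgs(),  # which property / fragment contexts to train with
    optimizer_conf=OptimizerConfig(learning_rate=1e-4),
    run_name="debug_run",
)
Trainer(train_args=args, dtype="float16", compile=False).train()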
trainLLamaMol.sh
ADDED
@@ -0,0 +1,19 @@
1 |
+
#!/bin/bash
|
2 |
+
#SBATCH --mem=32gb # Total memory limit
|
3 |
+
#SBATCH --nodes=1
|
4 |
+
#SBATCH --ntasks-per-node=1
|
5 |
+
#SBATCH --cpus-per-task=2
|
6 |
+
#SBATCH --partition=<YOUR PARTITION>
|
7 |
+
#SBATCH --gres=gpu:a100:1
|
8 |
+
#SBATCH --time=2-00:00:00 # Time limit in days-hrs:min:sec
|
9 |
+
|
10 |
+
export CUDA_VISIBLE_DEVICES=0
|
11 |
+
|
12 |
+
# TODO: Change FULL_PATH_TO_CONDA to the binary where the conda folder is: see https://github.com/conda/conda/issues/8536
|
13 |
+
conda activate FULL_PATH_TO_CONDA/torch2-llamol
|
14 |
+
module load CUDA/11.7.0
|
15 |
+
module load GCC/7.1.0-2.28
|
16 |
+
|
17 |
+
cd ~/llama2-mol
|
18 |
+
|
19 |
+
srun python train.py train=llama2-M-Full-RSS > "train_runs/run_$SLURM_JOB_ID.out"
|
trainLLamaMolDDPSingleNode.sh
ADDED
@@ -0,0 +1,28 @@
1 |
+
#!/bin/bash
|
2 |
+
#SBATCH --mem=32gb # Total memory limit
|
3 |
+
#SBATCH --nodes=1
|
4 |
+
#SBATCH --ntasks-per-node=<HOW MANY GPUS>
|
5 |
+
#SBATCH --cpus-per-task=2
|
6 |
+
#SBATCH --partition=<YOUR PARTITION>
|
7 |
+
#SBATCH --gres=gpu:a100:<HOW MANY GPUS>
|
8 |
+
#SBATCH --time=2-00:00:00 # Time limit in days-hrs:min:sec
|
9 |
+
|
10 |
+
export WORLD_SIZE=2 # keep in sync with <HOW MANY GPUS> above
|
11 |
+
export OMP_NUM_THREADS=8
|
12 |
+
### get the first node name as master address - customized for vgg slurm
|
13 |
+
### e.g. master(gnodee[2-5],gnoded1) == gnodee2
|
14 |
+
echo "NODELIST="${SLURM_NODELIST}
|
15 |
+
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
|
16 |
+
PORT=54357
|
17 |
+
export MASTER_ADDR="$master_addr"
export MASTER_PORT=$PORT
|
18 |
+
|
19 |
+
|
20 |
+
# TODO: Change FULL_PATH_TO_CONDA to the binary where the conda folder is: see https://github.com/conda/conda/issues/8536
|
21 |
+
conda activate FULL_PATH_TO_CONDA/torch2-llamol
|
22 |
+
module load CUDA/11.7.0
|
23 |
+
module load GCC/8.3.0
|
24 |
+
|
25 |
+
# TODO: Change this to the folder you cloned the repo in
|
26 |
+
cd ~/llamol
|
27 |
+
|
28 |
+
srun torchrun --standalone --max_restarts=3 --nnodes=1 --nproc_per_node=2 --rdzv-id=$SLURM_JOB_ID --rdzv-backend=c10d --rdzv-endpoint="$master_addr:$PORT" train.py train=llama2-M-Full > "train_runs/run_$SLURM_JOB_ID.out"
|
trainer.py
ADDED
@@ -0,0 +1,513 @@
1 |
+
from dataclasses import dataclass
|
2 |
+
from typing import Any, Dict, Optional, Tuple, List, Union
|
3 |
+
from fragment_creator import fragment_creator_factory
|
4 |
+
|
5 |
+
from model import ContextArgs, ModelArgs
|
6 |
+
from tqdm import tqdm
|
7 |
+
import math
|
8 |
+
import os
|
9 |
+
import time
|
10 |
+
from contextlib import nullcontext
|
11 |
+
from datetime import datetime
|
12 |
+
from functools import partial
|
13 |
+
|
14 |
+
import torch
|
15 |
+
import numpy as np
|
16 |
+
from model import ContextArgs, Transformer, ModelArgs
|
17 |
+
from torch.distributed import destroy_process_group, init_process_group
|
18 |
+
from torch.nn.parallel import DistributedDataParallel as DDP
|
19 |
+
|
20 |
+
from preprocess_dataset import SmilesTask
|
21 |
+
from tokenizer import SmilesTokenizer
|
22 |
+
|
23 |
+
import logging
|
24 |
+
|
25 |
+
logger = logging.getLogger(__name__)
|
26 |
+
|
27 |
+
|
28 |
+
@dataclass
|
29 |
+
class IOConfig:
|
30 |
+
# I/O
|
31 |
+
out_dir: str = "out"
|
32 |
+
eval_interval: int = 500
|
33 |
+
log_interval: int = 10
|
34 |
+
eval_iters: int = 25
|
35 |
+
eval_only: bool = False # if True, script exits right after the first eval
|
36 |
+
always_save_checkpoint: bool = (
|
37 |
+
False # if True, always save a checkpoint after each eval
|
38 |
+
)
|
39 |
+
init_from: str = "scratch" # 'scratch' or 'resume'
|
40 |
+
resume_when_snapshot_available: bool = True
|
41 |
+
|
42 |
+
|
43 |
+
@dataclass
|
44 |
+
class LoaderConfig:
|
45 |
+
# data
|
46 |
+
batch_size: int = (
|
47 |
+
384 # if gradient_accumulation_steps > 1, this is the micro-batch size
|
48 |
+
)
|
49 |
+
max_seq_len: int = 768
|
50 |
+
dataset: str = "smiles"
|
51 |
+
processed_dataset_ckpt: str = "processed_dataset_None.pkl"
|
52 |
+
fragment_creator: Union[str, None] = None
|
53 |
+
|
54 |
+
|
55 |
+
# dim = 256
|
56 |
+
# n_layers = 8
|
57 |
+
# n_heads = 8
|
58 |
+
# multiple_of = 128
|
59 |
+
# dropout = 0.1
|
60 |
+
|
61 |
+
|
62 |
+
@dataclass
|
63 |
+
class OptimizerConfig:
|
64 |
+
# adamw optimizer
|
65 |
+
gradient_accumulation_steps: int = 4 # used to simulate larger batch sizes
|
66 |
+
learning_rate: float = 1e-4 # max learning rate
|
67 |
+
max_iters: int = 100000 # total number of training iterations
|
68 |
+
weight_decay: float = 1e-1
|
69 |
+
beta1: float = 0.9
|
70 |
+
beta2: float = 0.95
|
71 |
+
grad_clip: float = 1.0 # clip gradients at this value, or disable if == 0.0
|
72 |
+
# learning rate decay settings
|
73 |
+
decay_lr: bool = True # whether to decay the learning rate
|
74 |
+
warmup_iters: int = 1000 # how many steps to warm up for
|
75 |
+
|
76 |
+
lr_decay_iters: int = 100000 # should be ~= max_iters per Chinchilla
|
77 |
+
min_lr: float = (
|
78 |
+
0.0 # minimum learning rate, should be ~= learning_rate/10 per Chinchilla
|
79 |
+
)
|
80 |
+
|
81 |
+
|
82 |
+
@dataclass
|
83 |
+
class TrainerArgs:
|
84 |
+
# Input / Output
|
85 |
+
io_conf: IOConfig
|
86 |
+
|
87 |
+
# Loader Configs
|
88 |
+
loader_conf: LoaderConfig
|
89 |
+
|
90 |
+
# Transformer Args
|
91 |
+
model_conf: ModelArgs
|
92 |
+
context_conf: ContextArgs
|
93 |
+
|
94 |
+
# Optimizer
|
95 |
+
optimizer_conf: OptimizerConfig
|
96 |
+
|
97 |
+
run_name: str
|
98 |
+
|
99 |
+
|
100 |
+
class Trainer:
|
101 |
+
def __init__(
|
102 |
+
self, train_args: TrainerArgs, dtype: str = "float16", compile: bool = False
|
103 |
+
) -> None:
|
104 |
+
self.train_conf = train_args
|
105 |
+
self.dtype = dtype
|
106 |
+
self.compile = compile
|
107 |
+
# system
|
108 |
+
self.run_name = train_args.run_name
|
109 |
+
self.device = (
|
110 |
+
"cuda:0" if torch.cuda.is_available() else "cpu"
|
111 |
+
) # "cuda" # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1' etc., or try 'mps' on macbooks
|
112 |
+
|
113 |
+
self.CKPT_PT = f"{self.run_name}.pt"
|
114 |
+
self.SNAPSHOT_PT = f"snapshot_{self.run_name}.pt"
|
115 |
+
|
116 |
+
def _init_ddp_if_possible(self):
|
117 |
+
# various inits, derived attributes, I/O setup
|
118 |
+
self.ddp = int(os.environ.get("RANK", -1)) != -1 # is this a ddp run?
|
119 |
+
if self.ddp:
|
120 |
+
logger.info(f"Using ddp!")
|
121 |
+
init_process_group(backend="nccl")
|
122 |
+
self.ddp_rank = int(os.environ["RANK"])
|
123 |
+
self.ddp_local_rank = int(os.environ["LOCAL_RANK"])
|
124 |
+
self.ddp_world_size = int(os.environ["WORLD_SIZE"])
|
125 |
+
logger.info(f"{self.ddp_rank}, {self.ddp_local_rank},{self.ddp_world_size}")
|
126 |
+
|
127 |
+
self.device = f"cuda:{self.ddp_local_rank}"
|
128 |
+
torch.cuda.set_device(self.device)
|
129 |
+
self.master_process = (
|
130 |
+
self.ddp_rank == 0
|
131 |
+
) # this process will do logging, checkpointing etc.
|
132 |
+
|
133 |
+
logger.info(f"Is master process {self.device}? {self.master_process}")
|
134 |
+
self.seed_offset = self.ddp_rank # each process gets a different seed
|
135 |
+
# world_size number of processes will be training simultaneously, so we can scale
|
136 |
+
# down the desired gradient accumulation iterations per process proportionally
|
137 |
+
assert (
|
138 |
+
self.train_conf.optimizer_conf.gradient_accumulation_steps
|
139 |
+
% self.ddp_world_size
|
140 |
+
== 0
|
141 |
+
)
|
142 |
+
self.train_conf.optimizer_conf.gradient_accumulation_steps //= (
|
143 |
+
self.ddp_world_size
|
144 |
+
)
|
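# Illustrative numbers: gradient_accumulation_steps=4 on a 2-GPU run leaves
# 2 micro-batches per rank, so the effective global batch size is unchanged.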
145 |
+
else:
|
146 |
+
# if not ddp, we are running on a single gpu, and one process
|
147 |
+
self.master_process = True
|
148 |
+
self.seed_offset = 0
|
149 |
+
self.ddp_world_size = 1
|
150 |
+
|
151 |
+
def _init_train(self):
|
152 |
+
self.tokens_per_iter = (
|
153 |
+
self.train_conf.optimizer_conf.gradient_accumulation_steps
|
154 |
+
* self.ddp_world_size
|
155 |
+
* self.train_conf.loader_conf.batch_size
|
156 |
+
* self.train_conf.loader_conf.max_seq_len
|
157 |
+
)
|
158 |
+
if self.master_process:
|
159 |
+
logger.info(f"tokens per iteration will be: {self.tokens_per_iter:,}")
|
160 |
+
logger.info(
|
161 |
+
f"breaks down as: {self.train_conf.optimizer_conf.gradient_accumulation_steps} grad accum steps * {self.ddp_world_size} processes * {self.train_conf.loader_conf.batch_size} batch size * {self.train_conf.loader_conf.max_seq_len } max seq len"
|
162 |
+
)
|
163 |
+
|
164 |
+
if self.master_process:
|
165 |
+
os.makedirs(self.train_conf.io_conf.out_dir, exist_ok=True)
|
166 |
+
|
167 |
+
torch.manual_seed(1337 + self.seed_offset)
|
168 |
+
np.random.seed(1337 + self.seed_offset)
|
169 |
+
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
|
170 |
+
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
|
171 |
+
self.device_type = (
|
172 |
+
"cuda" if "cuda" in self.device else "cpu"
|
173 |
+
) # for later use in torch.autocast
|
174 |
+
# note: float16 data type will automatically use a GradScaler
|
175 |
+
ptdtype = {
|
176 |
+
"float32": torch.float32,
|
177 |
+
"bfloat16": torch.bfloat16,
|
178 |
+
"float16": torch.float16,
|
179 |
+
}[self.dtype]
|
180 |
+
self.ctx = (
|
181 |
+
nullcontext()
|
182 |
+
if self.device_type == "cpu"
|
183 |
+
else torch.amp.autocast(device_type=self.device_type, dtype=ptdtype)
|
184 |
+
)
|
185 |
+
# task-specific setup
|
186 |
+
task = {"smiles": SmilesTask}[self.train_conf.loader_conf.dataset]
|
187 |
+
self.iter_batches = partial(
|
188 |
+
task.iter_batches,
|
189 |
+
batch_size=self.train_conf.loader_conf.batch_size,
|
190 |
+
device=self.device,
|
191 |
+
context_keys=self.train_conf.context_conf.context_keys,
|
192 |
+
num_workers=0,
|
193 |
+
dataset=self.train_conf.loader_conf.processed_dataset_ckpt,
|
194 |
+
fragment_creator=fragment_creator_factory(
|
195 |
+
self.train_conf.loader_conf.fragment_creator
|
196 |
+
),
|
197 |
+
)
|
198 |
+
# init these up here, can override if init_from='resume' (i.e. from a checkpoint)
|
199 |
+
self.iter_num = 0
|
200 |
+
self.best_val_loss = 1e9
|
201 |
+
self.epoch = 1
|
202 |
+
|
203 |
+
self.tokenizer = SmilesTokenizer()
|
204 |
+
|
205 |
+
has_resumed = False
|
206 |
+
if (
|
207 |
+
self.train_conf.io_conf.init_from == "resume"
|
208 |
+
or self.train_conf.io_conf.resume_when_snapshot_available
|
209 |
+
):
|
210 |
+
snapshot_path = os.path.join(
|
211 |
+
self.train_conf.io_conf.out_dir, self.SNAPSHOT_PT
|
212 |
+
)
|
213 |
+
if os.path.exists(snapshot_path):
|
214 |
+
has_resumed = True
|
215 |
+
logger.info(f"Resuming training from {self.train_conf.io_conf.out_dir}")
|
216 |
+
# resume training from a checkpoint.
|
217 |
+
ckpt_path = os.path.join(self.train_conf.io_conf.out_dir, self.CKPT_PT)
|
218 |
+
self.model = Transformer.load(ckpt_path, device=self.device)
|
219 |
+
snapshot = torch.load(snapshot_path, map_location=self.device)
|
220 |
+
self.iter_num = snapshot["iter_num"]
|
221 |
+
self.best_val_loss = snapshot["best_val_loss"]
|
222 |
+
self.epoch = snapshot["epoch"]
|
223 |
+
|
224 |
+
if self.train_conf.io_conf.init_from == "scratch" and not has_resumed:
|
225 |
+
# init a new model from scratch
|
226 |
+
logger.info("Initializing a new model from scratch")
|
227 |
+
logger.info(self.device)
|
228 |
+
|
229 |
+
model_conf = self.train_conf.model_conf
|
230 |
+
model_conf.vocab_size = self.tokenizer.vocab_size
|
231 |
+
|
232 |
+
self.model = Transformer(model_conf, self.train_conf.context_conf).to(
|
233 |
+
self.device
|
234 |
+
)
|
235 |
+
logger.info(
|
236 |
+
f"Number of params: {self.model.getNumberParams()} Number Trainable Params: {self.model.getNumberTrainableParams()}"
|
237 |
+
)
|
238 |
+
|
239 |
+
# else:
|
240 |
+
# raise ValueError(
|
241 |
+
# f"Could not find option: {self.train_conf.io_conf.init_from}. Use either 'scratch' or 'resume'"
|
242 |
+
# )
|
243 |
+
|
244 |
+
self.model = self.model.to(self.device)
|
245 |
+
|
246 |
+
# initialize a GradScaler. If enabled=False scaler is a no-op
|
247 |
+
self.scaler = torch.cuda.amp.GradScaler(enabled=(self.dtype == "float16"))
|
248 |
+
|
249 |
+
# optimizer
|
250 |
+
self.optimizer = self.model.configure_optimizers(
|
251 |
+
self.train_conf.optimizer_conf.weight_decay,
|
252 |
+
self.train_conf.optimizer_conf.learning_rate,
|
253 |
+
(
|
254 |
+
self.train_conf.optimizer_conf.beta1,
|
255 |
+
self.train_conf.optimizer_conf.beta2,
|
256 |
+
),
|
257 |
+
self.device_type,
|
258 |
+
)
|
259 |
+
|
260 |
+
if (
|
261 |
+
self.train_conf.io_conf.init_from == "resume"
|
262 |
+
and "optimizer_state" in snapshot
|
263 |
+
):
|
264 |
+
logger.info("Loading optimizer state from snapshot")
|
265 |
+
self.optimizer.load_state_dict(snapshot["optimizer_state"])
|
266 |
+
snapshot = None # free up memory
|
267 |
+
|
268 |
+
# compile the model
|
269 |
+
if self.compile:
|
270 |
+
logger.info("compiling the model... (takes a ~minute)")
|
271 |
+
self.unoptimized_model = self.model
|
272 |
+
# NOTE: This is REALLY REALLY slow in our case, as the shapes are different in each epoch.
|
273 |
+
# So it recompiles every batch ._.
|
274 |
+
self.model = torch.compile(
|
275 |
+
self.model, dynamic=False
|
276 |
+
) # requires PyTorch 2.0
|
277 |
+
|
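        # Added note: with shapes changing every step, passing dynamic=True to
        # torch.compile (PyTorch 2.x) or skipping compilation entirely are the
        # usual ways around this recompilation cost.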
        # wrap the model in a DDP container
        if self.ddp:
            # Ignore the `freqs_cis` buffer so that DDP does not broadcast it at
            # construction time, since NCCL does not support `ComplexFloat`
            prefix = "_orig_mod." if self.compile else ""  # fixed: bare `compile` was the builtin, which is always truthy
            self.model._ddp_params_and_buffers_to_ignore = {prefix + "freqs_cis"}
            self.model = DDP(self.model, device_ids=[self.ddp_local_rank])

    # helps estimate an arbitrarily accurate loss over either split using many batches
    @torch.no_grad()
    def estimate_loss(self):
        out = {}
        self.model.eval()
        for split in ["train", "val"]:
            batch_iter = self.iter_batches(split)
            losses = torch.zeros(self.train_conf.io_conf.eval_iters)  # keep on CPU
            for k in tqdm(
                range(self.train_conf.io_conf.eval_iters),
                total=self.train_conf.io_conf.eval_iters,
                desc="Eval",
            ):
                try:
                    X = next(batch_iter)
                    with self.ctx:
                        logits = self.model(
                            X["src"],
                            targets=X["tgt"],
                            context=X["context"],
                            fragment=X["fragment"],
                        )

                    loss = self.raw_model.last_loss
                    losses[k] = loss.item()
                except StopIteration:
                    logger.info("Early Eval Stop")

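            # NOTE (added): on an early stop, the remaining entries of `losses` keep
            # their initial value of 0.0, so the mean below underestimates the true loss.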
            out[split] = losses.mean()
        self.model.train()
        return out

    # learning rate decay scheduler (cosine with warmup)
    def get_lr(self, it: int):
        warmup_iters = self.train_conf.optimizer_conf.warmup_iters
        learning_rate = self.train_conf.optimizer_conf.learning_rate
        lr_decay_iters = self.train_conf.optimizer_conf.lr_decay_iters
        min_lr = self.train_conf.optimizer_conf.min_lr

        # 1) linear warmup for warmup_iters steps
        if it < warmup_iters:
            return learning_rate * it / warmup_iters
        # 2) if it > lr_decay_iters, return the minimum learning rate
        if it > lr_decay_iters:
            return min_lr
        # 3) in between, use cosine decay down to the minimum learning rate
        decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
        assert 0 <= decay_ratio <= 1
        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff ranges 0..1
        return min_lr + coeff * (learning_rate - min_lr)

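    # Worked example for get_lr above, with made-up values (warmup_iters=100,
    # learning_rate=3e-4, lr_decay_iters=10_000, min_lr=3e-5); the real values
    # come from the optimizer config:
    #   it=50     -> 1.5e-4   (mid-warmup)
    #   it=100    -> 3.0e-4   (peak, decay_ratio=0)
    #   it=5_050  -> 1.65e-4  (decay_ratio=0.5, coeff=0.5)
    #   it>10_000 -> 3.0e-5   (floor)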
    def train(self):
        self._init_ddp_if_possible()
        self._init_train()

        # training loop
        train_batch_iter = self.iter_batches("train")
        X = next(train_batch_iter)  # fetch the very first batch
        t0 = time.time()
        local_iter_num = 0  # number of iterations in the lifetime of this process
        self.raw_model = (
            self.model.module if self.ddp else self.model
        )  # unwrap DDP container if needed
        running_mfu = -1.0

        gradient_accumulation_steps = (
            self.train_conf.optimizer_conf.gradient_accumulation_steps
        )
        while True:
            # determine and set the learning rate for this iteration
            lr = (
                self.get_lr(self.iter_num)
                if self.train_conf.optimizer_conf.decay_lr
                else self.train_conf.optimizer_conf.learning_rate
            )
            for param_group in self.optimizer.param_groups:
                param_group["lr"] = lr

            # evaluate the loss on the train/val sets and write checkpoints
            if (
                self.iter_num % self.train_conf.io_conf.eval_interval == 0
                and self.master_process
                and self.iter_num != 0
            ):
                logger.info(
                    f"Estimating loss for master_process({self.master_process}) on iter {self.iter_num}"
                )
                losses = self.estimate_loss()
                logger.info(
                    f"step {self.iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}"
                )
                log_dict = {
                    "iter": self.iter_num,
                    "tokens": self.iter_num * self.tokens_per_iter,
                    "loss/train": losses["train"],
                    "loss/val": losses["val"],
                    "lr": lr,
                    "mfu": running_mfu * 100,  # convert to a percentage
                }
                logger.info(f"{log_dict}")

                if (
                    losses["val"] < self.best_val_loss
                    or self.train_conf.io_conf.always_save_checkpoint
                ):
                    self.best_val_loss = losses["val"]
                    if self.iter_num > 0:
                        logger.info(
                            f"saving checkpoint to {self.train_conf.io_conf.out_dir}"
                        )
                        self.raw_model.save(
                            os.path.join(self.train_conf.io_conf.out_dir, self.CKPT_PT)
                        )

                        torch.save(
                            {
                                "iter_num": self.iter_num,
                                "epoch": self.epoch,
                                "best_val_loss": self.best_val_loss,
                                "optimizer_state": self.optimizer.state_dict(),
                            },
                            os.path.join(
                                self.train_conf.io_conf.out_dir, self.SNAPSHOT_PT
                            ),
                        )

            if self.iter_num == 0 and self.train_conf.io_conf.eval_only:
                break

            # forward/backward/update, with optional gradient accumulation to simulate
            # a larger batch size, using the GradScaler if the data type is float16
            for micro_step in range(gradient_accumulation_steps):
                if self.ddp:
                    # in DDP training we only need to sync gradients at the last micro step.
                    # the official way to do this is with the model.no_sync() context manager,
                    # but that bloats the code and forces us to repeat ourselves; looking at
                    # the source of that context manager, it just toggles this variable
                    self.model.require_backward_grad_sync = (
                        micro_step == gradient_accumulation_steps - 1
                    )
                with self.ctx:
                    context = X["context"]
                    fragment = X["fragment"]

                    # SCL (Stochastic Context Learning) algorithm:
                    # drop the fragment with probability 0.15
                    if np.random.random() < 0.15 or fragment is None:
                        fragment = None

                    # NOTE: randomly delete one or more context columns
                    current_context_keys = list(context.keys())
                    for k in current_context_keys:
                        if np.random.random() < 0.15:
                            del context[k]

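                    # Added note: this per-key dropout (p = 0.15, independently for the
                    # fragment and each property) exposes the model to every subset of
                    # conditions during training, which is what later allows sampling
                    # with any subset of properties, a fragment, or no context at all.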
                    logits = self.model(
                        X["src"], targets=X["tgt"], context=context, fragment=fragment
                    )
                    loss = self.raw_model.last_loss
                    loss = loss / gradient_accumulation_steps
                # immediately prefetch the next batch while the model is doing the forward pass on the GPU
                try:
                    X = next(train_batch_iter)
                except StopIteration:
                    # StopIteration is thrown when the dataset ends,
                    # so reinitialize the data loader
                    logger.info(f"Done Epoch {self.epoch}")
                    train_batch_iter = self.iter_batches("train")
                    X = next(train_batch_iter)
                    self.epoch += 1

                # backward pass, with gradient scaling if training in fp16
                self.scaler.scale(loss).backward()
            # clip the gradient
            if self.train_conf.optimizer_conf.grad_clip != 0.0:
                self.scaler.unscale_(self.optimizer)
                torch.nn.utils.clip_grad_norm_(
                    self.model.parameters(), self.train_conf.optimizer_conf.grad_clip
                )
            # step the optimizer and scaler if training in fp16
            self.scaler.step(self.optimizer)
            self.scaler.update()
            # flush the gradients as soon as we can; no need for this memory anymore
            self.optimizer.zero_grad(set_to_none=True)

            # timing and logging
            t1 = time.time()
            dt = t1 - t0
            t0 = t1

            if (
                self.iter_num % self.train_conf.io_conf.log_interval == 0
                and self.master_process
            ):
                # get loss as a float, scaled up due to the divide above. note: this is a CPU-GPU sync point
                lossf = loss.item() * gradient_accumulation_steps
                if local_iter_num >= 5:  # let the training loop settle a bit
                    mfu = self.raw_model.estimate_mfu(
                        self.train_conf.loader_conf.batch_size
                        * gradient_accumulation_steps,
                        dt,
                    )
                    running_mfu = (
                        mfu if running_mfu == -1.0 else 0.9 * running_mfu + 0.1 * mfu
                    )
                logger.info(
                    f"{self.iter_num} | loss {lossf:.4f} | lr {lr:e} | {dt*1000:.2f}ms | mfu {running_mfu*100:.2f}%"
                )
            self.iter_num += 1
            local_iter_num += 1

            # termination condition
            if self.iter_num > self.train_conf.optimizer_conf.max_iters:
                logger.info("Done with training iters!")
                break

        if self.ddp:
            destroy_process_group()

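# Hedged, self-contained sketch (not invoked by the trainer) of the fp16 update
# pattern used in train() above: scale -> backward -> unscale -> clip -> step ->
# update. The toy model, shapes, and hyperparameters are illustrative only.
def _amp_accumulation_sketch():
    import torch

    model = torch.nn.Linear(8, 1)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
    # enabled=False makes the scaler a no-op, matching the float32 path (and CPU)
    scaler = torch.cuda.amp.GradScaler(enabled=False)
    accum_steps = 4

    for _ in range(accum_steps):
        x = torch.randn(16, 8)
        loss = model(x).pow(2).mean() / accum_steps  # divide as the trainer does
        scaler.scale(loss).backward()  # gradients accumulate across micro steps

    scaler.unscale_(opt)  # unscale before clipping so the threshold is in true units
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(opt)
    scaler.update()
    opt.zero_grad(set_to_none=True)
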
if __name__ == "__main__":
    pass