{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "4feb6490", "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem\n", "import pandas as pd\n", "from tqdm import tqdm\n", "from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator, GetAtomPairGenerator, GetTopologicalTorsionGenerator\n", "from rdkit.Chem import MACCSkeys\n", "import numpy as np\n", "from rdkit.Chem import Descriptors, rdmolops\n", "from rdkit.Chem.Descriptors import MolWt, MolLogP\n", "from rdkit.Chem.rdMolDescriptors import CalcTPSA, CalcNumRotatableBonds\n", "import networkx as nx" ] }, { "cell_type": "code", "execution_count": null, "id": "59d77a9b", "metadata": {}, "outputs": [], "source": [ "BASE_PATH = '/'\n", "TARGETS = ['Density']\n", "\n", "def get_canonical_smiles(smiles):\n", " try:\n", " mol = Chem.MolFromSmiles(smiles)\n", " if mol:\n", " return Chem.MolToSmiles(mol, canonical=True)\n", " except:\n", " pass\n", " return smiles\n", "\n", "print(\"šŸ“‚ Loading competition data...\")\n", "train = pd.read_csv(BASE_PATH + 'train.csv')\n", "test = pd.read_csv(BASE_PATH + 'test.csv')\n", "\n", "print(f\" Training samples: {len(train)}\")\n", "print(f\" Test samples: {len(test)}\")\n", "\n", "def clean_and_validate_smiles(smiles):\n", " if not isinstance(smiles, str) or len(smiles) == 0:\n", " return None\n", " \n", " bad_patterns = [\n", " '[R]', '[R1]', '[R2]', '[R3]', '[R4]', '[R5]', \n", " \"[R']\", '[R\"]', 'R1', 'R2', 'R3', 'R4', 'R5',\n", " '([R])', '([R1])', '([R2])', \n", " ]\n", " \n", " for pattern in bad_patterns:\n", " if pattern in smiles:\n", " return None\n", " \n", " if '][' in smiles and any(x in smiles for x in ['[R', 'R]']):\n", " return None\n", " \n", " try:\n", " mol = Chem.MolFromSmiles(smiles)\n", " if mol is not None:\n", " return Chem.MolToSmiles(mol, canonical=True)\n", " else:\n", " return None\n", " except:\n", " return None\n", " \n", " return smiles\n", "\n", "print(\"šŸ”„ Cleaning and validating SMILES...\")\n", "train['SMILES'] = train['SMILES'].apply(clean_and_validate_smiles)\n", "test['SMILES'] = test['SMILES'].apply(clean_and_validate_smiles)\n", "\n", "invalid_train = train['SMILES'].isnull().sum()\n", "invalid_test = test['SMILES'].isnull().sum()\n", "\n", "print(f\" Removed {invalid_train} invalid SMILES from training data\")\n", "print(f\" Removed {invalid_test} invalid SMILES from test data\")\n", "\n", "train = train[train['SMILES'].notnull()].reset_index(drop=True)\n", "test = test[test['SMILES'].notnull()].reset_index(drop=True)\n", "\n", "print(f\" Final training samples: {len(train)}\")\n", "print(f\" Final test samples: {len(test)}\")\n", "\n", "def add_extra_data_clean(df_train, df_extra, target):\n", " n_samples_before = len(df_train[df_train[target].notnull()])\n", " \n", " print(f\" Processing {len(df_extra)} {target} samples...\")\n", " \n", " df_extra['SMILES'] = df_extra['SMILES'].apply(clean_and_validate_smiles)\n", " \n", " before_filter = len(df_extra)\n", " df_extra = df_extra[df_extra['SMILES'].notnull()]\n", " df_extra = df_extra.dropna(subset=[target])\n", " after_filter = len(df_extra)\n", " \n", " print(f\" Kept {after_filter}/{before_filter} valid samples\")\n", " \n", " if len(df_extra) == 0:\n", " print(f\" No valid data remaining for {target}\")\n", " return df_train\n", " \n", " df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()\n", " \n", " cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])\n", " unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])\n", "\n", " 
{ "cell_type": "code", "execution_count": null, "id": "59d77a9b", "metadata": {}, "outputs": [], "source": [
"BASE_PATH = '/'\n",
"TARGETS = ['Density']\n",
"\n",
"def get_canonical_smiles(smiles):\n",
"    try:\n",
"        mol = Chem.MolFromSmiles(smiles)\n",
"        if mol:\n",
"            return Chem.MolToSmiles(mol, canonical=True)\n",
"    except Exception:\n",
"        pass\n",
"    return smiles\n",
"\n",
"print(\"šŸ“‚ Loading competition data...\")\n",
"train = pd.read_csv(BASE_PATH + 'train.csv')\n",
"test = pd.read_csv(BASE_PATH + 'test.csv')\n",
"\n",
"print(f\"   Training samples: {len(train)}\")\n",
"print(f\"   Test samples: {len(test)}\")\n",
"\n",
"def clean_and_validate_smiles(smiles):\n",
"    if not isinstance(smiles, str) or len(smiles) == 0:\n",
"        return None\n",
"\n",
"    # Polymer SMILES with unresolved R-group placeholders cannot be parsed\n",
"    bad_patterns = [\n",
"        '[R]', '[R1]', '[R2]', '[R3]', '[R4]', '[R5]',\n",
"        \"[R']\", '[R\"]', 'R1', 'R2', 'R3', 'R4', 'R5',\n",
"        '([R])', '([R1])', '([R2])',\n",
"    ]\n",
"\n",
"    for pattern in bad_patterns:\n",
"        if pattern in smiles:\n",
"            return None\n",
"\n",
"    if '][' in smiles and any(x in smiles for x in ['[R', 'R]']):\n",
"        return None\n",
"\n",
"    try:\n",
"        mol = Chem.MolFromSmiles(smiles)\n",
"        if mol is not None:\n",
"            return Chem.MolToSmiles(mol, canonical=True)\n",
"        else:\n",
"            return None\n",
"    except Exception:\n",
"        return None\n",
"\n",
"print(\"šŸ”„ Cleaning and validating SMILES...\")\n",
"train['SMILES'] = train['SMILES'].apply(clean_and_validate_smiles)\n",
"test['SMILES'] = test['SMILES'].apply(clean_and_validate_smiles)\n",
"\n",
"invalid_train = train['SMILES'].isnull().sum()\n",
"invalid_test = test['SMILES'].isnull().sum()\n",
"\n",
"print(f\"   Removed {invalid_train} invalid SMILES from training data\")\n",
"print(f\"   Removed {invalid_test} invalid SMILES from test data\")\n",
"\n",
"train = train[train['SMILES'].notnull()].reset_index(drop=True)\n",
"test = test[test['SMILES'].notnull()].reset_index(drop=True)\n",
"\n",
"print(f\"   Final training samples: {len(train)}\")\n",
"print(f\"   Final test samples: {len(test)}\")\n",
"\n",
"def add_extra_data_clean(df_train, df_extra, target):\n",
"    n_samples_before = len(df_train[df_train[target].notnull()])\n",
"\n",
"    print(f\"   Processing {len(df_extra)} {target} samples...\")\n",
"\n",
"    df_extra['SMILES'] = df_extra['SMILES'].apply(clean_and_validate_smiles)\n",
"\n",
"    before_filter = len(df_extra)\n",
"    df_extra = df_extra[df_extra['SMILES'].notnull()]\n",
"    df_extra = df_extra.dropna(subset=[target])\n",
"    after_filter = len(df_extra)\n",
"\n",
"    print(f\"   Kept {after_filter}/{before_filter} valid samples\")\n",
"\n",
"    if len(df_extra) == 0:\n",
"        print(f\"   No valid data remaining for {target}\")\n",
"        return df_train\n",
"\n",
"    # Average duplicate measurements per molecule\n",
"    df_extra = df_extra.groupby('SMILES', as_index=False)[target].mean()\n",
"\n",
"    cross_smiles = set(df_extra['SMILES']) & set(df_train['SMILES'])\n",
"    unique_smiles_extra = set(df_extra['SMILES']) - set(df_train['SMILES'])\n",
"\n",
"    # Fill missing target values for molecules already in the training set\n",
"    filled_count = 0\n",
"    for smile in df_train[df_train[target].isnull()]['SMILES'].tolist():\n",
"        if smile in cross_smiles:\n",
"            df_train.loc[df_train['SMILES']==smile, target] = \\\n",
"                df_extra[df_extra['SMILES']==smile][target].values[0]\n",
"            filled_count += 1\n",
"\n",
"    # Append external molecules not yet present in the training set\n",
"    extra_to_add = df_extra[df_extra['SMILES'].isin(unique_smiles_extra)].copy()\n",
"    if len(extra_to_add) > 0:\n",
"        for col in TARGETS:\n",
"            if col not in extra_to_add.columns:\n",
"                extra_to_add[col] = np.nan\n",
"\n",
"        extra_to_add = extra_to_add[['SMILES'] + TARGETS]\n",
"        df_train = pd.concat([df_train, extra_to_add], axis=0, ignore_index=True)\n",
"\n",
"    n_samples_after = len(df_train[df_train[target].notnull()])\n",
"    print(f'   {target}: +{n_samples_after-n_samples_before} samples ({filled_count} filled), +{len(unique_smiles_extra)} unique SMILES')\n",
"    return df_train\n",
"\n",
"print(\"\\nšŸ“‚ Loading external datasets...\")\n",
"\n",
"external_datasets = []\n",
"\n",
"def safe_load_dataset(path, target, processor_func, description):\n",
"    try:\n",
"        if path.endswith('.xlsx'):\n",
"            data = pd.read_excel(path)\n",
"        else:\n",
"            data = pd.read_csv(path)\n",
"\n",
"        data = processor_func(data)\n",
"        external_datasets.append((target, data))\n",
"        print(f\"   āœ… {description}: {len(data)} samples\")\n",
"        return True\n",
"    except Exception as e:\n",
"        print(f\"   āš ļø {description} failed: {str(e)[:100]}\")\n",
"        return False\n",
"\n",
"# Link: https://github.com/Duke-MatSci/ChemProps\n",
"safe_load_dataset(\n",
"    './density_data.xlsx',\n",
"    'Density',\n",
"    lambda df: df.rename(columns={'density(g/cm3)': 'Density'})[['SMILES', 'Density']]\n",
"        .query('SMILES.notnull() and Density.notnull() and Density != \"nylon\"')\n",
"        .assign(Density=lambda x: x['Density'].astype(float) - 0.118),  # shift by a fixed 0.118 g/cm3 offset\n",
"    'Density data'\n",
")\n",
"\n",
"# Link: https://www.kaggle.com/datasets/oleggromov/polymer-tg-density-excerpt\n",
"safe_load_dataset(\n",
"    './tg_density.csv',\n",
"    'Density',\n",
"    lambda df: df[['SMILES', 'Density']] if 'Density' in df.columns else df,\n",
"    'Density data polymer-tg-density-excerpt'\n",
")\n",
"\n",
"print(\"\\nšŸ”„ Integrating external data...\")\n",
"train_extended = train[['SMILES'] + TARGETS].copy()\n",
"\n",
"for target, dataset in external_datasets:\n",
"    print(f\"   Processing {target} data...\")\n",
"    train_extended = add_extra_data_clean(train_extended, dataset, target)\n",
"\n",
"print(\"\\nšŸ“Š Final training data:\")\n",
"print(f\"   Original samples: {len(train)}\")\n",
"print(f\"   Extended samples: {len(train_extended)}\")\n",
"print(f\"   Gain: +{len(train_extended) - len(train)} samples\")\n",
"\n",
"for target in TARGETS:\n",
"    count = train_extended[target].notna().sum()\n",
"    original_count = train[target].notna().sum() if target in train.columns else 0\n",
"    gain = count - original_count\n",
"    print(f\"   {target}: {count:,} samples (+{gain})\")\n",
"\n",
"train_df = train_extended\n",
"print(\"\\nāœ… Data integration complete with clean SMILES!\")" ] },
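{ "cell_type": "markdown", "id": "aug-demo-md", "metadata": {}, "source": [ "The augmentation below relies on RDKit's randomized SMILES output: the same molecule can be serialized as many different strings. A small sketch (not part of the pipeline) on one arbitrary example molecule (aspirin), showing that every randomized variant still canonicalizes back to the same structure." ] },
{ "cell_type": "code", "execution_count": null, "id": "aug-demo", "metadata": {}, "outputs": [], "source": [
"# Sketch: enumerate a few randomized SMILES for one example molecule\n",
"# (aspirin) to illustrate what doRandom=True produces.\n",
"demo_mol = Chem.MolFromSmiles('CC(=O)Oc1ccccc1C(=O)O')\n",
"variants = {Chem.MolToSmiles(demo_mol, doRandom=True) for _ in range(5)}\n",
"for v in variants:\n",
"    # every variant decodes back to the same canonical form\n",
"    assert Chem.MolToSmiles(Chem.MolFromSmiles(v)) == Chem.MolToSmiles(demo_mol)\n",
"print(f\"{len(variants)} distinct strings for the same molecule:\")\n",
"print('\\n'.join(sorted(variants)))" ] },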
{ "cell_type": "code", "execution_count": null, "id": "55763e3f", "metadata": {}, "outputs": [], "source": [
"def augment_smiles_dataset(train_df, label, num_augments=1):\n",
"    augmented_df = {\n",
"        'SMILES': [],\n",
"    }\n",
"    augmented_df[label] = []\n",
"    failed = 0\n",
"\n",
"    # Iterate over every labelled molecule\n",
"    for idx, row in tqdm(train_df.iterrows(), desc=\"šŸ”¬ Data Augmentation\", total=len(train_df)):\n",
"        smiles = row['SMILES']\n",
"        target = row[label]\n",
"\n",
"        # Skip molecules RDKit cannot parse\n",
"        mol = Chem.MolFromSmiles(smiles)\n",
"        if mol is None:\n",
"            failed += 1\n",
"            continue\n",
"\n",
"        # Keep the original SMILES...\n",
"        augmented_df['SMILES'].append(smiles)\n",
"        augmented_df[label].append(target)\n",
"\n",
"        # ...plus num_augments randomized re-serializations with the same label\n",
"        for _ in range(num_augments):\n",
"            rand_smiles = Chem.MolToSmiles(mol, doRandom=True)\n",
"            augmented_df['SMILES'].append(rand_smiles)\n",
"            augmented_df[label].append(target)\n",
"\n",
"    print(f\"No. of Failed SMILES: {failed}\")\n",
"    return pd.DataFrame(augmented_df)" ] },
{ "cell_type": "code", "execution_count": null, "id": "4a9b06a3", "metadata": {}, "outputs": [], "source": [
"train_df = augment_smiles_dataset(train_df, 'Density')\n",
"train_df = train_df[train_df['Density'].notnull()].reset_index(drop=True)" ] },
{ "cell_type": "code", "execution_count": null, "id": "86b804ed", "metadata": {}, "outputs": [], "source": [
"required_descriptors = {'graph_diameter','num_cycles','avg_shortest_path','MolWt', 'LogP', 'TPSA', 'RotatableBonds', 'NumAtoms', 'SMILES'}\n",
"\n",
"filters = {\n",
"    'Tg': list(set(['deg_mean', 'FractionCSP3', 'num_cycles', 'RingCount', 'HallKierAlpha', 'SMR_VSA7', 'BertzCT', 'ring_size_6', 'fr_benzene', 'NumAromaticCarbocycles', 'NumAromaticRings', 'SlogP_VSA6', 'SlogP_VSA1', 'betw_mean', 'VSA_EState6', 'BalabanJ', 'Chi4n', 'FP_446', 'PEOE_VSA14', 'Chi3n', 'AvgIpc', 'FP_489', 'Chi1', 'HeavyAtomCount', 'NumHeterocycles', 'FP_485', 'fr_bicyclic', 'SMR_VSA10', 'FP_537', 'VSA_EState2', 'FP_539', 'FP_529', 'HeavyAtomMolWt', 'LabuteASA', 'ring_size_5', 'FP_505', 'NumAmideBonds', 'MolMR', 'FP_80', 'FP_195', 'FP_310', 'fr_amide', 'FP_509', 'FP_378', 'ExactMolWt', 'MolWt', 'FP_211', 'Chi2n', 'FP_266', 'FP_379', 'FP_207', 'FP_504', 'FP_203', 'NumAtoms', 'FP_199', 'FP_519', 'FP_123', 'FP_278', 'FpDensityMorgan1', 'FP_119', 'fr_imide', 'FP_279', 'FP_223', 'betw_std', 'FP_231', 'FP_219', 'FP_251', 'NumValenceElectrons', 'FP_480', 'Chi0n', 'FP_517', 'FP_255', 'Chi0', 'FP_522', 'FP_528', 'FP_526', 'FpDensityMorgan2', 'FP_354', 'Chi1n', 'FP_459', 'FP_547', 'FP_476', 'Chi0v', 'FP_210', 'FP_516', 'FP_382', 'FP_215', 'FP_243', 'FP_521', 'FP_227', 'NumAliphaticHeterocycles', 'FP_469', 'FP_467', 'FP_342', 'FP_549', 'FP_357', 'FP_494', 'FP_194', 'FP_546', 'FP_302']).union(required_descriptors)),\n",
"\n",
"    'FFV': list(set(['MolLogP', 'LogP', 'Chi3v', 'Chi2v', 'Chi4v', 'Chi4n', 'Chi3n', 'VSA_EState6', 'SMR_VSA7', 'Chi1v', 'Chi2n', 'MolMR', 'Chi1n', 'Chi0v', 'SlogP_VSA6', 'BertzCT', 'PEOE_VSA14', 'Chi0n', 'LabuteASA', 'EState_VSA8', 'BalabanJ', 'Ipc', 'Chi1', 'deg_mean', 'VSA_EState8', 'MolWt', 'ExactMolWt', 'SMR_VSA9', 'HeavyAtomMolWt', 'Chi0', 'SMR_VSA6', 'EState_VSA5', 'FpDensityMorgan3', 'Kappa1', 'AvgIpc', 'FpDensityMorgan2', 'SlogP_VSA8', 'HallKierAlpha', 'FP_39', 'avg_shortest_path', 'SMR_VSA1', 'SlogP_VSA5', 'betw_mean', 'TPSA', 'FpDensityMorgan1', 'lap_eig_6', 'qed', 'lap_eig_7', 'lap_eig_8', 'RingCount', 'NumValenceElectrons', 'NumAromaticRings', 'lap_eig_5', 'num_cycles', 'EState_VSA7', 'Kappa2', 'NumAtoms', 'ring_size_6', 'betw_std', 'lap_eig_4', 'lap_eig_3', 'HeavyAtomCount', 'fr_benzene', 'NumHDonors', 'NumAromaticCarbocycles', 'PEOE_VSA7', 'SlogP_VSA2', 'SlogP_VSA3', 'NHOHCount', 'NOCount', 'fr_NH1', 'EState_VSA4', 'FP_515', 'MaxEStateIndex', 'MaxAbsEStateIndex', 'lap_eig_2', 'PEOE_VSA6', 'VSA_EState5', 'EState_VSA6', 'FractionCSP3', 'EState_VSA3', 'Phi', 'FP_535', 'NumHAcceptors', 'SlogP_VSA4', 'fr_C_O', 'FP_446', 'FP_488', 'SMR_VSA10', 'PEOE_VSA9', 'fr_C_O_noCOO', 'FP_125', 'FP_474', 'SlogP_VSA7', 'graph_diameter', 'SlogP_VSA12', 'FP_507', 'fr_bicyclic', 'MinAbsEStateIndex', 'deg_std']).union(required_descriptors)),\n",
"\n",
"    'Tc': 
list(set(['deg_std', 'fr_unbrch_alkane', 'FP_287', 'FP_286', 'betw_mean', 'avg_shortest_path', 'Kappa3', 'graph_diameter', 'FP_285', 'FP_187', 'FP_191', 'FP_139', 'VSA_EState7', 'FpDensityMorgan3', 'FP_143', 'FpDensityMorgan2', 'FP_171', 'Phi', 'FpDensityMorgan1', 'FP_167', 'FP_175', 'qed', 'FP_163', 'FP_131', 'FP_531', 'FP_502', 'FP_142', 'Kappa2', 'FP_135', 'FP_179', 'SlogP_VSA5', 'FP_513', 'FP_134', 'SMR_VSA5', 'FP_93', 'FP_138', 'FP_130', 'RotatableBonds', 'NumRotatableBonds', 'FP_182', 'FP_162', 'FP_518', 'FP_491', 'FP_174', 'FP_496', 'FP_284', 'FP_141', 'FP_475', 'lap_eig_7', 'FP_154', 'lap_eig_8', 'FP_512', 'FP_453', 'FP_256', 'FP_488', 'deg_max', 'FP_170', 'FP_137', 'FP_190', 'FP_17', 'FP_466', 'FP_535', 'FP_474', 'fr_NH1', 'lap_eig_6', 'Chi3n', 'FP_133', 'PEOE_VSA6', 'FP_178', 'FP_186', 'Chi1n', 'FP_450', 'FP_102', 'FP_508', 'EState_VSA5', 'NumHDonors', 'NumAtomStereoCenters', 'NumUnspecifiedAtomStereoCenters', 'NHOHCount', 'FP_183', 'Chi2n', 'Chi3v', 'FP_89', 'SPS', 'betw_max', 'AvgIpc', 'Chi4n', 'Chi1v', 'FP_478', 'lap_eig_5', 'FP_484', 'SMR_VSA3', 'FP_192', 'FP_166', 'Kappa1', 'FP_495', 'FP_526', 'fr_halogen', 'FP_153', 'FP_28']).union(required_descriptors)),\n", "\n", " 'Density': list(set(['SMR_VSA5', 'VSA_EState8', 'VSA_EState7', 'SlogP_VSA5', 'SMR_VSA10', 'FractionCSP3', 'EState_VSA5', 'SlogP_VSA12', 'VSA_EState10', 'fr_unbrch_alkane', 'NumRotatableBonds', 'RotatableBonds', 'FP_119', 'FP_513', 'PEOE_VSA8', 'Kappa3', 'PEOE_VSA7', 'FP_180', 'FP_472', 'FP_428', 'Kappa2', 'FP_80', 'Phi', 'FP_539', 'FP_512', 'FP_531', 'EState_VSA7', 'FP_537', 'FP_502', 'FP_98', 'NumHAcceptors', 'Chi1n', 'MaxAbsEStateIndex', 'MaxEStateIndex', 'PEOE_VSA14', 'FP_500', 'MolLogP', 'LogP', 'FP_465', 'MinAbsEStateIndex', 'Chi2n', 'SlogP_VSA7', 'FP_176', 'avg_shortest_path', 'EState_VSA4', 'FP_181', 'lap_eig_5', 'Chi0n', 'HallKierAlpha', 'PEOE_VSA5', 'qed', 'graph_diameter', 'FP_186', 'betw_mean', 'FP_287', 'FP_179', 'lap_eig_4', 'FP_134', 'Chi3n', 'NOCount', 'fr_C_S', 'FP_131', 'FP_177', 'FP_166', 'FP_127', 'FP_162', 'FP_191', 'FP_143', 'Chi4n', 'TPSA', 'lap_eig_3', 'Chi1v', 'SlogP_VSA6', 'FP_178', 'FP_457', 'FP_139', 'FP_163', 'SMR_VSA7', 'SlogP_VSA11', 'SlogP_VSA3', 'FP_183', 'Chi0', 'FP_137', 'ring_size_6', 'FP_138', 'fr_benzene', 'NumAromaticCarbocycles', 'FP_420', 'NumAromaticRings', 'NumAromaticHeterocycles', 'FP_492', 'FP_169', 'FP_284', 'Chi1', 'FP_141', 'FP_35', 'FP_182', 'FP_521', 'EState_VSA3', 'FP_135']).union(required_descriptors)),\n", "\n", " 'Rg': list(set(['FP_93', 'SlogP_VSA7', 'PEOE_VSA14', 'qed', 'FP_544', 'VSA_EState8', 'FP_499', 'SlogP_VSA1', 'fr_unbrch_alkane', 'FP_42', 'EState_VSA4', 'FP_192', 'FP_508', 'FP_520', 'lap_eig_8', 'Phi', 'FP_155', 'NumAtomStereoCenters', 'NumUnspecifiedAtomStereoCenters', 'avg_shortest_path', 'FP_17', 'FP_317', 'lap_eig_7', 'FP_73', 'VSA_EState7', 'FP_224', 'fr_ester', 'graph_diameter', 'Kappa2', 'NumAmideBonds', 'fr_NH1', 'FP_191', 'fr_amide', 'FP_286', 'Kappa3', 'FP_159', 'FP_488', 'FP_33', 'deg_std', 'FP_280', 'FP_364', 'FP_287', 'EState_VSA5', 'SlogP_VSA5', 'FP_515', 'TPSA', 'FP_151', 'SMR_VSA5', 'FP_498', 'NOCount', 'betw_mean', 'RotatableBonds', 'NumRotatableBonds', 'FP_273', 'SMR_VSA3', 'FP_163', 'FP_134', 'FP_478', 'FP_138', 'FP_187', 'FP_137', 'FP_252', 'VSA_EState3', 'FP_171', 'FP_175', 'lap_eig_6', 'NHOHCount', 'Chi4v', 'FpDensityMorgan1', 'FP_182', 'FP_526', 'FP_167', 'FP_486', 'FP_142', 'FP_316', 'AvgIpc', 'MolLogP', 'LogP', 'FP_183', 'FP_130', 'FP_102', 'FP_1', 'FP_115', 'SMR_VSA10', 'Chi4n', 'FP_24', 'FP_533', 'NumHDonors', 'FP_193', 
'FP_147', 'FP_38', 'Chi3n', 'FP_249', 'FP_453', 'FP_535', 'FP_492', 'Chi3v', 'FP_240', 'FP_501', 'FP_139']).union(required_descriptors))\n", "}" ] },
{ "cell_type": "code", "execution_count": null, "id": "476b6b17", "metadata": {}, "outputs": [], "source": [
"def smiles_to_combined_fingerprints_with_descriptors(smiles_list, radius=2, n_bits=128, n_eigs=8):\n",
"    generator = GetMorganGenerator(radius=radius, fpSize=n_bits)\n",
"    atom_pair_gen = GetAtomPairGenerator(fpSize=n_bits)\n",
"    torsion_gen = GetTopologicalTorsionGenerator(fpSize=n_bits)\n",
"\n",
"    fingerprints = []\n",
"    descriptors = []\n",
"    valid_smiles = []\n",
"    invalid_indices = []\n",
"\n",
"    for i, smiles in tqdm(enumerate(smiles_list), total=len(smiles_list), desc=\"šŸ”¬ Featurization\"):\n",
"        mol = Chem.MolFromSmiles(smiles)\n",
"        if mol:\n",
"            # Fingerprints\n",
"            morgan_fp = generator.GetFingerprint(mol)\n",
"            atom_pair_fp = atom_pair_gen.GetFingerprint(mol)\n",
"            torsion_fp = torsion_gen.GetFingerprint(mol)\n",
"            maccs_fp = MACCSkeys.GenMACCSKeys(mol)\n",
"\n",
"            combined_fp = np.concatenate([\n",
"                np.array(morgan_fp),\n",
"                np.array(atom_pair_fp),\n",
"                np.array(torsion_fp),\n",
"                np.array(maccs_fp)\n",
"            ])\n",
"            fingerprints.append(combined_fp)\n",
"\n",
"            # RDKit descriptors\n",
"            descriptor_values = {}\n",
"            for name, func in Descriptors.descList:\n",
"                try:\n",
"                    descriptor_values[name] = func(mol)\n",
"                except Exception:\n",
"                    descriptor_values[name] = None\n",
"\n",
"            # Specific descriptors\n",
"            descriptor_values['MolWt'] = MolWt(mol)\n",
"            descriptor_values['LogP'] = MolLogP(mol)\n",
"            descriptor_values['TPSA'] = CalcTPSA(mol)\n",
"            descriptor_values['RotatableBonds'] = CalcNumRotatableBonds(mol)\n",
"            descriptor_values['NumAtoms'] = mol.GetNumAtoms()\n",
"            descriptor_values['SMILES'] = smiles\n",
"\n",
"            # Build the molecular graph once and reuse it for all graph features\n",
"            adj = rdmolops.GetAdjacencyMatrix(mol)\n",
"            G = nx.from_numpy_array(adj)\n",
"\n",
"            # Graph-based features\n",
"            try:\n",
"                if nx.is_connected(G):\n",
"                    descriptor_values['graph_diameter'] = nx.diameter(G)\n",
"                    descriptor_values['avg_shortest_path'] = nx.average_shortest_path_length(G)\n",
"                else:\n",
"                    descriptor_values['graph_diameter'] = 0\n",
"                    descriptor_values['avg_shortest_path'] = 0\n",
"\n",
"                cycles = nx.cycle_basis(G)\n",
"                descriptor_values['num_cycles'] = len(cycles)\n",
"                sizes = [len(c) for c in cycles]\n",
"                for k in range(3, 9):\n",
"                    descriptor_values[f'ring_size_{k}'] = sizes.count(k)\n",
"            except Exception:\n",
"                descriptor_values['graph_diameter'] = None\n",
"                descriptor_values['avg_shortest_path'] = None\n",
"                descriptor_values['num_cycles'] = None\n",
"                for k in range(3, 9):\n",
"                    descriptor_values[f'ring_size_{k}'] = None\n",
"\n",
"            # Centralities\n",
"            deg = dict(nx.degree(G))\n",
"            bc = nx.betweenness_centrality(G)\n",
"            cc = nx.clustering(G)\n",
"            for label, metric in [('deg', deg), ('betw', bc), ('clust', cc)]:\n",
"                vals = np.array(list(metric.values()), dtype=float)\n",
"                descriptor_values[f'{label}_mean'] = vals.mean()\n",
"                descriptor_values[f'{label}_std'] = vals.std()\n",
"                descriptor_values[f'{label}_max'] = vals.max()\n",
"\n",
"            # Spectral features: the n_eigs smallest eigenvalues of the normalized\n",
"            # Laplacian, zero-padded for molecules with fewer atoms than n_eigs\n",
"            L = nx.normalized_laplacian_matrix(G).toarray()\n",
"            eigs = np.sort(np.linalg.eigvals(L).real)\n",
"            for j in range(min(n_eigs, len(eigs))):\n",
"                descriptor_values[f'lap_eig_{j+1}'] = eigs[j]\n",
"            for j in range(len(eigs), n_eigs):\n",
"                descriptor_values[f'lap_eig_{j+1}'] = 0.0\n",
"\n",
"            # Log-compress descriptors with extreme dynamic range\n",
"            descriptor_values['Ipc'] = np.log10(descriptor_values['Ipc'] + 1e-20)\n",
"            descriptor_values['lap_eig_1'] = np.sign(descriptor_values['lap_eig_1']) * np.log10(np.abs(descriptor_values['lap_eig_1']) + 1e-20)\n",
"\n",
"            descriptors.append(descriptor_values)\n",
"            valid_smiles.append(smiles)\n",
"        else:\n",
"            # Placeholders keep the outputs aligned with the input list\n",
"            fingerprints.append(np.zeros(n_bits * 3 + 167))\n",
"            descriptors.append(None)\n",
"            valid_smiles.append(None)\n",
"            invalid_indices.append(i)\n",
"\n",
"    return np.array(fingerprints), descriptors, valid_smiles, invalid_indices" ] },
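{ "cell_type": "markdown", "id": "graph-demo-md", "metadata": {}, "source": [ "Before featurizing the whole dataset, a quick sanity check (a sketch, not part of the pipeline) of the networkx-based graph and spectral features on benzene, where the expected values are easy to verify by hand." ] },
{ "cell_type": "code", "execution_count": null, "id": "graph-demo", "metadata": {}, "outputs": [], "source": [
"# Benzene is a single 6-ring, so its diameter, cycle basis and normalized\n",
"# Laplacian spectrum are all known in closed form.\n",
"demo_mol = Chem.MolFromSmiles('c1ccccc1')\n",
"G = nx.from_numpy_array(rdmolops.GetAdjacencyMatrix(demo_mol))\n",
"print('diameter:', nx.diameter(G))  # 3\n",
"print('avg shortest path:', nx.average_shortest_path_length(G))  # 1.8\n",
"print('cycle sizes:', [len(c) for c in nx.cycle_basis(G)])  # [6]\n",
"eigs = np.sort(np.linalg.eigvals(nx.normalized_laplacian_matrix(G).toarray()).real)\n",
"print('smallest Laplacian eigenvalues:', np.round(eigs[:3], 4))  # [0, 0.5, 0.5]" ] },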
{ "cell_type": "code", "execution_count": null, "id": "08073585", "metadata": {}, "outputs": [], "source": [
"label = 'Density'\n",
"smiles = train_df['SMILES'].tolist()\n",
"\n",
"fingerprints, descriptors, valid_smiles, invalid_indices = smiles_to_combined_fingerprints_with_descriptors(smiles, radius=2, n_bits=128)\n",
"\n",
"X = pd.DataFrame(descriptors)\n",
"# Drop partial-charge-based descriptors, which are often undefined for polymer SMILES\n",
"X = X.drop(['BCUT2D_MWLOW', 'BCUT2D_MWHI', 'BCUT2D_CHGHI', 'BCUT2D_CHGLO', 'BCUT2D_LOGPHI', 'BCUT2D_LOGPLOW', 'BCUT2D_MRLOW', 'BCUT2D_MRHI', 'MinAbsPartialCharge', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge'], axis=1)\n",
"selected = filters[label]  # feature subset pre-selected for this target\n",
"X = X.filter(items=selected)\n",
"\n",
"fp_df = pd.DataFrame(fingerprints, columns=[f'FP_{i}' for i in range(fingerprints.shape[1])])\n",
"print(f\"Fingerprints Shape: {fp_df.shape}\")\n",
"\n",
"fp_df.reset_index(drop=True, inplace=True)\n",
"X.reset_index(drop=True, inplace=True)\n",
"X = pd.concat([X, fp_df], axis=1)\n",
"\n",
"print(f\"After concat: {X.shape}\")\n",
"density = X\n",
"density.to_csv('Density.csv', index=False)\n" ] } ], "metadata": { "kernelspec": { "display_name": "champs-infer", "language": "python", "name": "python3" }, "language_info": { "name": "python", "version": "3.8.20" } }, "nbformat": 4, "nbformat_minor": 5 }