Spaces:
Sleeping
Sleeping
Commit · ed603e1
1 Parent(s): 082f385
Update code/pdb_featureVector.py
Browse files
- code/pdb_featureVector.py +41 -31
code/pdb_featureVector.py
CHANGED
|
@@ -230,35 +230,33 @@ def pdb(input_set, mode, impute):
|
|
| 230 |
modbase = no_swiss_models.copy()
|
| 231 |
print('Proceeding to Modbase search...')
|
| 232 |
print('------------------------------------\n')
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
else:
|
| 259 |
-
modbase = pd.DataFrame(columns = SIMPLE_COLS)
|
| 260 |
else:
|
| 261 |
-
modbase =
|
| 262 |
|
| 263 |
else:
|
| 264 |
no_modbase_models_updated = pd.DataFrame()
|
|
@@ -272,7 +270,15 @@ def pdb(input_set, mode, impute):
|
|
| 272 |
'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
| 273 |
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
|
| 274 |
'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
|
|
|
|
|
|
|
| 276 |
if len(pdb)>0:
|
| 277 |
pdb = pdb[COLS]
|
| 278 |
pdb['Source'] = 'PDB'
|
|
@@ -288,8 +294,7 @@ def pdb(input_set, mode, impute):
|
|
| 288 |
modbase['Source'] = 'Modbase'
|
| 289 |
else:
|
| 290 |
modbase = pd.DataFrame()
|
| 291 |
-
|
| 292 |
-
no_modbase_models_updated = pd.DataFrame()
|
| 293 |
|
| 294 |
# st.write('======PDB==========')
|
| 295 |
# st.write(pdb.to_string())
|
|
@@ -462,10 +467,15 @@ def pdb(input_set, mode, impute):
|
|
| 462 |
hours, rem = divmod(end - start, 3600)
|
| 463 |
minutes, seconds = divmod(rem, 60)
|
| 464 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
return final_data
|
| 467 |
elif len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
|
| 468 |
st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
| 469 |
-
st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins
|
| 470 |
elif len(no_modbase_models_updated) == org_len:
|
| 471 |
-
st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
|
|
|
|
| 230 |
modbase = no_swiss_models.copy()
|
| 231 |
print('Proceeding to Modbase search...')
|
| 232 |
print('------------------------------------\n')
|
| 233 |
+
|
| 234 |
+
modbase = modbase[SIMPLE_COLS]
|
| 235 |
+
modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
|
| 236 |
+
modbase = modbase.fillna(np.NaN)
|
| 237 |
+
print('\n>> Adding Modbase residue positions...\n')
|
| 238 |
+
modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut','datapoint']]
|
| 239 |
+
modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'mut','datapoint'])
|
| 240 |
+
modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
|
| 241 |
|
| 242 |
+
if len(modbaseOut) > 0:
|
| 243 |
+
modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'mut','datapoint'], how = 'left')
|
| 244 |
+
no_modbase_models_updated['sasa'] = np.NaN
|
| 245 |
+
modbase.reset_index(inplace=True, drop=True)
|
| 246 |
+
no_modbase_add = modbase[pd.isna(modbase.coordinates)]
|
| 247 |
+
modbase = modbase[~pd.isna(modbase.coordinates)]
|
| 248 |
+
no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
|
| 249 |
+
print('\n>> Mapping to Modbase models...\n')
|
| 250 |
+
modbase = changeUPtoModels(modbase)
|
| 251 |
+
print('\n>> Calculating 3D distances for Modbase models...\n')
|
| 252 |
+
modbase = isZeroDistance(modbase)
|
| 253 |
+
modbase = match3DModels(modbase)
|
| 254 |
+
modbase = selectMaxAnnot(modbase)
|
| 255 |
+
modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, True, True, True, True])
|
| 256 |
+
modbase = modbase.drop_duplicates(['datapoint'])
|
| 257 |
+
modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
|
|
|
|
|
|
|
| 258 |
else:
|
| 259 |
+
modbase = pd.DataFrame(columns = SIMPLE_COLS)
|
| 260 |
|
| 261 |
else:
|
| 262 |
no_modbase_models_updated = pd.DataFrame()
|
|
|
|
| 270 |
'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
| 271 |
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
|
| 272 |
'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
|
| 273 |
+
|
| 274 |
+
if len(no_modbase_models_updated) == 0:
|
| 275 |
+
no_modbase_models_updated = pd.DataFrame()
|
| 276 |
+
no_modbase_models_updated = no_modbase_models_updated[~no_modbase_models_updated.datapoint.isin(modbase.datapoint.to_list())]
|
| 277 |
+
no_modbase_models_updated = no_modbase_models_updated[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']]
|
| 278 |
+
no_modbase_models_updated.pos = no_modbase_models_updated.pos.astype(int)
|
| 279 |
no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
|
| 280 |
+
|
| 281 |
+
|
| 282 |
if len(pdb)>0:
|
| 283 |
pdb = pdb[COLS]
|
| 284 |
pdb['Source'] = 'PDB'
|
|
|
|
| 294 |
modbase['Source'] = 'Modbase'
|
| 295 |
else:
|
| 296 |
modbase = pd.DataFrame()
|
| 297 |
+
|
|
|
|
| 298 |
|
| 299 |
# st.write('======PDB==========')
|
| 300 |
# st.write(pdb.to_string())
|
|
|
|
| 467 |
hours, rem = divmod(end - start, 3600)
|
| 468 |
minutes, seconds = divmod(rem, 60)
|
| 469 |
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
| 470 |
+
if len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
|
| 471 |
+
st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
| 472 |
+
st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins were mapped to a structure.')
|
| 473 |
+
elif len(no_modbase_models_updated) == org_len:
|
| 474 |
+
st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
|
| 475 |
|
| 476 |
return final_data
|
| 477 |
elif len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
|
| 478 |
st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
| 479 |
+
st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins were mapped to a structure.')
|
| 480 |
elif len(no_modbase_models_updated) == org_len:
|
| 481 |
+
st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
|