Spaces:
Sleeping
Sleeping
Commit
·
f44aa18
1
Parent(s):
84bc25a
Update code/pdb_featureVector.py
Browse files
- code/pdb_featureVector.py +86 -79
code/pdb_featureVector.py
CHANGED
|
@@ -82,8 +82,10 @@ def pdb(input_set, mode, impute):
|
|
| 82 |
out_path = path_to_output_files / 'log.txt'
|
| 83 |
#sys.stdout = open(out_path, 'w')
|
| 84 |
data = clean_data(input_set)
|
|
|
|
| 85 |
data = add_uniprot_sequence(data)
|
| 86 |
match = data[(data.wt_sequence_match == 'm')]
|
|
|
|
| 87 |
iso = data[(data.wt_sequence_match == 'i')]
|
| 88 |
noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]
|
| 89 |
if len(data) == 0:
|
|
@@ -233,24 +235,28 @@ def pdb(input_set, mode, impute):
|
|
| 233 |
modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
|
| 234 |
modbase = modbase.fillna(np.NaN)
|
| 235 |
print('\n>> Adding Modbase residue positions...\n')
|
| 236 |
-
modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'datapoint']]
|
| 237 |
-
modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'datapoint'])
|
| 238 |
modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
else:
|
| 255 |
modbase = modbase[SIMPLE_COLS]
|
| 256 |
|
|
@@ -266,7 +272,7 @@ def pdb(input_set, mode, impute):
|
|
| 266 |
'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
| 267 |
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
|
| 268 |
'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
|
| 269 |
-
|
| 270 |
if len(pdb)>0:
|
| 271 |
pdb = pdb[COLS]
|
| 272 |
pdb['Source'] = 'PDB'
|
|
@@ -282,7 +288,8 @@ def pdb(input_set, mode, impute):
|
|
| 282 |
modbase['Source'] = 'Modbase'
|
| 283 |
else:
|
| 284 |
modbase = pd.DataFrame()
|
| 285 |
-
no_modbase_models_updated
|
|
|
|
| 286 |
|
| 287 |
# st.write('======PDB==========')
|
| 288 |
# st.write(pdb.to_string())
|
|
@@ -291,13 +298,13 @@ def pdb(input_set, mode, impute):
|
|
| 291 |
# st.write('======MODBASE==========')
|
| 292 |
# st.write(modbase.to_string())
|
| 293 |
|
| 294 |
-
|
| 295 |
|
| 296 |
allData = pd.concat([pdb, swiss, modbase])
|
| 297 |
allData.reset_index(inplace=True, drop=True)
|
| 298 |
allData.replace({np.NaN: ''}, inplace=True)
|
| 299 |
-
#
|
| 300 |
-
#
|
| 301 |
if len(allData)>0:
|
| 302 |
allData.distance.replace({-1000: ''}, inplace=True)
|
| 303 |
|
|
@@ -318,52 +325,52 @@ def pdb(input_set, mode, impute):
|
|
| 318 |
k = pd.Series((key, str(list(set(val)))))
|
| 319 |
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
| 320 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
for i in
|
| 325 |
-
if (str(
|
| 326 |
-
|
| 327 |
-
elif (str(
|
| 328 |
-
|
| 329 |
-
elif (str(
|
| 330 |
-
|
| 331 |
-
elif (str(
|
| 332 |
-
|
| 333 |
-
elif
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
|
| 338 |
fisherResult = pd.read_csv(fisher_path, sep='\t')
|
| 339 |
significant_domains = fisherResult.domain.to_list()
|
| 340 |
-
for i in
|
| 341 |
-
if
|
| 342 |
-
|
| 343 |
else:
|
| 344 |
-
|
| 345 |
print('Final adjustments are being done...\n')
|
| 346 |
binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
for i in
|
| 350 |
for j in binaryCols:
|
| 351 |
-
|
| 352 |
-
if (
|
| 353 |
-
|
| 354 |
-
elif
|
| 355 |
-
|
| 356 |
-
elif (
|
| 357 |
-
|
| 358 |
|
| 359 |
annotCols = UNIPROT_ANNOTATION_COLS[:30]
|
| 360 |
|
| 361 |
-
for i in
|
| 362 |
for annot in annotCols:
|
| 363 |
binaryName = str(annot) + 'Binary'
|
| 364 |
-
if
|
| 365 |
-
|
| 366 |
-
|
| 367 |
columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
|
| 368 |
'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
|
| 369 |
'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
|
|
@@ -403,7 +410,7 @@ def pdb(input_set, mode, impute):
|
|
| 403 |
'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
|
| 404 |
'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
|
| 405 |
|
| 406 |
-
|
| 407 |
['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position','Source', 'meta_merged', 'composition', 'polarity',
|
| 408 |
'volume',
|
| 409 |
'granthamScore', 'domains_all',
|
|
@@ -432,33 +439,33 @@ def pdb(input_set, mode, impute):
|
|
| 432 |
16.82,
|
| 433 |
20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
|
| 434 |
col_index = 0
|
| 435 |
-
for col_ in
|
| 436 |
-
|
| 437 |
-
|
| 438 |
col_index += 1
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
elif (impute == 'False') or (impute == 'false'):
|
| 443 |
pass
|
| 444 |
-
|
| 445 |
-
|
| 446 |
|
| 447 |
# ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
| 448 |
-
if len(
|
| 449 |
print(
|
| 450 |
'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
hours, rem = divmod(end - start, 3600)
|
| 459 |
-
minutes, seconds = divmod(rem, 60)
|
| 460 |
-
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
| 461 |
-
#sys.stdout.close()
|
| 462 |
-
return data
|
| 463 |
-
|
| 464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
out_path = path_to_output_files / 'log.txt'
|
| 83 |
#sys.stdout = open(out_path, 'w')
|
| 84 |
data = clean_data(input_set)
|
| 85 |
+
|
| 86 |
data = add_uniprot_sequence(data)
|
| 87 |
match = data[(data.wt_sequence_match == 'm')]
|
| 88 |
+
org_len = len(match)
|
| 89 |
iso = data[(data.wt_sequence_match == 'i')]
|
| 90 |
noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]
|
| 91 |
if len(data) == 0:
|
|
|
|
| 235 |
modbase.replace({'[]': np.NaN, 'nan-nan': np.NaN, '': np.NaN}, inplace=True)
|
| 236 |
modbase = modbase.fillna(np.NaN)
|
| 237 |
print('\n>> Adding Modbase residue positions...\n')
|
| 238 |
+
modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut','datapoint']]
|
| 239 |
+
modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos' ,'mut','datapoint'])
|
| 240 |
modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
|
| 241 |
+
|
| 242 |
+
if len(modbaseOut) > 0:
|
| 243 |
+
modbase = modbase.merge(modbaseOut, on = ['uniprotID', 'wt', 'pos', 'mut','datapoint'], how = 'left')
|
| 244 |
+
no_modbase_models_updated['sasa'] = np.NaN
|
| 245 |
+
modbase.reset_index(inplace=True, drop=True)
|
| 246 |
+
no_modbase_add = modbase[pd.isna(modbase.coordinates)]
|
| 247 |
+
modbase = modbase[~pd.isna(modbase.coordinates)]
|
| 248 |
+
no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
|
| 249 |
+
print('\n>> Mapping to Modbase models...\n')
|
| 250 |
+
modbase = changeUPtoModels(modbase)
|
| 251 |
+
print('\n>> Calculating 3D distances for Modbase models...\n')
|
| 252 |
+
modbase = isZeroDistance(modbase)
|
| 253 |
+
modbase = match3DModels(modbase)
|
| 254 |
+
modbase = selectMaxAnnot(modbase)
|
| 255 |
+
modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance','hitTotal', 'annotTotal'], ascending=[True, True, True, True, True])
|
| 256 |
+
modbase = modbase.drop_duplicates(['datapoint'])
|
| 257 |
+
modbase.replace({'[]': np.NaN, 'hit': 0.0}, inplace=True)
|
| 258 |
+
else:
|
| 259 |
+
modbase = pd.DataFrame(columns = SIMPLE_COLS)
|
| 260 |
else:
|
| 261 |
modbase = modbase[SIMPLE_COLS]
|
| 262 |
|
|
|
|
| 272 |
'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
|
| 273 |
'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
|
| 274 |
'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']
|
| 275 |
+
no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
|
| 276 |
if len(pdb)>0:
|
| 277 |
pdb = pdb[COLS]
|
| 278 |
pdb['Source'] = 'PDB'
|
|
|
|
| 288 |
modbase['Source'] = 'Modbase'
|
| 289 |
else:
|
| 290 |
modbase = pd.DataFrame()
|
| 291 |
+
if len(no_modbase_models_updated) == 0:
|
| 292 |
+
no_modbase_models_updated = pd.DataFrame()
|
| 293 |
|
| 294 |
# st.write('======PDB==========')
|
| 295 |
# st.write(pdb.to_string())
|
|
|
|
| 298 |
# st.write('======MODBASE==========')
|
| 299 |
# st.write(modbase.to_string())
|
| 300 |
|
| 301 |
+
|
| 302 |
|
| 303 |
allData = pd.concat([pdb, swiss, modbase])
|
| 304 |
allData.reset_index(inplace=True, drop=True)
|
| 305 |
allData.replace({np.NaN: ''}, inplace=True)
|
| 306 |
+
# st.write('======ALL DATA==========')
|
| 307 |
+
# st.write(allData.to_string())
|
| 308 |
if len(allData)>0:
|
| 309 |
allData.distance.replace({-1000: ''}, inplace=True)
|
| 310 |
|
|
|
|
| 325 |
k = pd.Series((key, str(list(set(val)))))
|
| 326 |
interface_dataframe = interface_dataframe.append(k, ignore_index=True)
|
| 327 |
interface_dataframe.columns = ['uniprotID', 'positions']
|
| 328 |
+
final_data = finalTouch(allData)
|
| 329 |
+
final_data = final_data.merge(interface_dataframe, on='uniprotID', how='left')
|
| 330 |
+
final_data.positions = final_data.positions.astype('str')
|
| 331 |
+
for i in final_data.index:
|
| 332 |
+
if (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
|
| 333 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'interface'
|
| 334 |
+
elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'surface':
|
| 335 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'surface'
|
| 336 |
+
elif (str(final_data.at[i, 'pos']) not in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
|
| 337 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'core'
|
| 338 |
+
elif (str(final_data.at[i, 'pos']) in final_data.at[i, 'positions']) and final_data.at[i, 'trsh4'] == 'core':
|
| 339 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'conflict'
|
| 340 |
+
elif final_data.at[i, 'trsh4'] == 'nan':
|
| 341 |
+
final_data.at[i, 'threeState_trsh4_HQ'] = 'nan'
|
| 342 |
+
|
| 343 |
+
final_data.drop(['positions'], axis=1, inplace=True)
|
| 344 |
|
| 345 |
fisherResult = pd.read_csv(fisher_path, sep='\t')
|
| 346 |
significant_domains = fisherResult.domain.to_list()
|
| 347 |
+
for i in final_data.index:
|
| 348 |
+
if final_data.at[i, 'domain'] in significant_domains:
|
| 349 |
+
final_data.at[i, 'domain_fisher'] = final_data.at[i, 'domain']
|
| 350 |
else:
|
| 351 |
+
final_data.at[i, 'domain_fisher'] = 'NULL'
|
| 352 |
print('Final adjustments are being done...\n')
|
| 353 |
binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
|
| 354 |
+
final_data = final_data.astype(str)
|
| 355 |
+
final_data.replace({'NaN': 'nan'}, inplace=True)
|
| 356 |
+
for i in final_data.index:
|
| 357 |
for j in binaryCols:
|
| 358 |
+
final_data[j] = final_data[j].astype('str')
|
| 359 |
+
if (final_data.at[i, j] == '0') or (final_data.at[i, j] == '0.0'):
|
| 360 |
+
final_data.at[i, j] = '1'
|
| 361 |
+
elif final_data.at[i, j] == 'nan':
|
| 362 |
+
final_data.at[i, j] = '0'
|
| 363 |
+
elif (final_data.at[i, j] == '1') or (final_data.at[i, j] == '1.0'):
|
| 364 |
+
final_data.at[i, j] = '2'
|
| 365 |
|
| 366 |
annotCols = UNIPROT_ANNOTATION_COLS[:30]
|
| 367 |
|
| 368 |
+
for i in final_data.index:
|
| 369 |
for annot in annotCols:
|
| 370 |
binaryName = str(annot) + 'Binary'
|
| 371 |
+
if final_data.at[i, binaryName] == '2':
|
| 372 |
+
final_data.at[i, annot] = '0.0'
|
| 373 |
+
final_data.rename(
|
| 374 |
columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
|
| 375 |
'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
|
| 376 |
'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
|
|
|
|
| 410 |
'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
|
| 411 |
'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
|
| 412 |
|
| 413 |
+
final_data = final_data[
|
| 414 |
['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position','Source', 'meta_merged', 'composition', 'polarity',
|
| 415 |
'volume',
|
| 416 |
'granthamScore', 'domains_all',
|
|
|
|
| 439 |
16.82,
|
| 440 |
20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
|
| 441 |
col_index = 0
|
| 442 |
+
for col_ in final_data.columns[-30:]:
|
| 443 |
+
final_data[col_] = final_data[col_].fillna(filler[col_index])
|
| 444 |
+
final_data[col_] = final_data[col_].replace({'nan': filler[col_index]})
|
| 445 |
col_index += 1
|
| 446 |
+
final_data['domains_3Ddist'] = final_data['domains_3Ddist'].fillna(24.5)
|
| 447 |
+
final_data['sasa'] = final_data['sasa'].fillna(29.5)
|
| 448 |
+
final_data['location_3state'] = final_data['location_3state'].fillna('unknown')
|
| 449 |
elif (impute == 'False') or (impute == 'false'):
|
| 450 |
pass
|
| 451 |
+
final_data = final_data.replace({'nan': np.NaN})
|
| 452 |
+
final_data.domains_all = final_data.domains_all.replace({-1: 'NULL'})
|
| 453 |
|
| 454 |
# ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
| 455 |
+
if len(final_data) == 0:
|
| 456 |
print(
|
| 457 |
'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
|
| 458 |
+
final_data.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
|
| 459 |
+
|
| 460 |
+
print('Feature vector successfully created...')
|
| 461 |
+
end = timer()
|
| 462 |
+
hours, rem = divmod(end - start, 3600)
|
| 463 |
+
minutes, seconds = divmod(rem, 60)
|
| 464 |
+
print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
|
| 466 |
+
return final_data
|
| 467 |
+
elif len(no_modbase_models_updated) >0 and (len(no_modbase_models_updated) !=org_len):
|
| 468 |
+
st.write(f'{len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
| 469 |
+
st.write(f'{org_len-len(no_modbase_models_updated)} of {org_len} datapoins could not be mapped to any structures.')
|
| 470 |
+
elif len(no_modbase_models_updated) == org_len:
|
| 471 |
+
st.write(f'0 of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')
|