Spaces:
Sleeping
Sleeping
Commit
·
7c5bea8
1
Parent(s):
80cfda5
Update code/pdb_featureVector.py
Browse files- code/pdb_featureVector.py +13 -10
code/pdb_featureVector.py
CHANGED
|
@@ -172,6 +172,7 @@ def pdb(input_set, mode, impute):
|
|
| 172 |
print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
|
| 173 |
% (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
|
| 174 |
len(uniprot_matched.drop_duplicates(['datapoint']))))
|
|
|
|
| 175 |
|
| 176 |
"""
|
| 177 |
STEP 5
|
|
@@ -262,6 +263,7 @@ def pdb(input_set, mode, impute):
|
|
| 262 |
cnt +=1
|
| 263 |
print()
|
| 264 |
print('PDB file processing finished..')
|
|
|
|
| 265 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
| 266 |
try:
|
| 267 |
filename_replace_ext = filename.with_suffix(".pdb")
|
|
@@ -325,7 +327,7 @@ def pdb(input_set, mode, impute):
|
|
| 325 |
# If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
|
| 326 |
# If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
|
| 327 |
# If the query data points are found in with_pdb data frame, it will be searched in the following steps.
|
| 328 |
-
|
| 329 |
"""
|
| 330 |
STEP 6
|
| 331 |
Retrieve sequence annotations.
|
|
@@ -366,7 +368,7 @@ def pdb(input_set, mode, impute):
|
|
| 366 |
with_pdb.replace({'[]': 'nan'}, inplace=True)
|
| 367 |
with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
|
| 368 |
with_pdb.replace({'': 'nan'}, inplace=True)
|
| 369 |
-
|
| 370 |
"""
|
| 371 |
STEP 7
|
| 372 |
Do alignment for PDB
|
|
@@ -448,7 +450,7 @@ def pdb(input_set, mode, impute):
|
|
| 448 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
| 449 |
no_pdb = no_pdb.copy()
|
| 450 |
|
| 451 |
-
|
| 452 |
print('PDB matching is completed...\n')
|
| 453 |
print('SUMMARY')
|
| 454 |
print('-------')
|
|
@@ -543,7 +545,7 @@ def pdb(input_set, mode, impute):
|
|
| 543 |
swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
|
| 544 |
print('Index File Processed...\n')
|
| 545 |
|
| 546 |
-
|
| 547 |
# Get relevant columns
|
| 548 |
swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
|
| 549 |
# Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
|
|
@@ -623,7 +625,7 @@ def pdb(input_set, mode, impute):
|
|
| 623 |
swiss_model = None
|
| 624 |
no_swiss_models = None
|
| 625 |
url_nan = None
|
| 626 |
-
|
| 627 |
# At this point we have:
|
| 628 |
# pdb_aligned --- Align in the PDB phase
|
| 629 |
# not_match_in_uniprot --- This won't be aligned with anything because these proteins don't have a UniProt ID. Only basic info is present.
|
|
@@ -713,7 +715,7 @@ def pdb(input_set, mode, impute):
|
|
| 713 |
k = pd.Series(swiss_models_with_data.iloc[i])
|
| 714 |
broken_swiss = broken_swiss.append(k, ignore_index=True)
|
| 715 |
c += 1
|
| 716 |
-
|
| 717 |
if len(broken_swiss) == 0:
|
| 718 |
broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
|
| 719 |
|
|
@@ -821,7 +823,7 @@ def pdb(input_set, mode, impute):
|
|
| 821 |
not_nan = None
|
| 822 |
which_ones_are_match = None
|
| 823 |
swiss_not_match = None
|
| 824 |
-
|
| 825 |
# STEP : GO TO MODBASE
|
| 826 |
# Should not include anything related to prev models.
|
| 827 |
if len(to_modbase) != 0:
|
|
@@ -843,7 +845,7 @@ def pdb(input_set, mode, impute):
|
|
| 843 |
|
| 844 |
modbase_reduced = pd.DataFrame()
|
| 845 |
modbase_fasta = pd.DataFrame()
|
| 846 |
-
|
| 847 |
print('Retrieving ModBase models...\n')
|
| 848 |
# Get model files associated with each UniProtID
|
| 849 |
for protein in list(set(to_modbase.uniprotID.to_list())):
|
|
@@ -919,6 +921,7 @@ def pdb(input_set, mode, impute):
|
|
| 919 |
quality_score = -999
|
| 920 |
|
| 921 |
print()
|
|
|
|
| 922 |
if len(modbase_fasta) != 0:
|
| 923 |
modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
|
| 924 |
else:
|
|
@@ -1227,7 +1230,7 @@ def pdb(input_set, mode, impute):
|
|
| 1227 |
rest.drop(['index'], axis=1, inplace=True)
|
| 1228 |
rest = rest.astype('str')
|
| 1229 |
to_modbase_size = 0
|
| 1230 |
-
|
| 1231 |
print('Modbase matching is completed...\n')
|
| 1232 |
print('SUMMARY')
|
| 1233 |
print('-------')
|
|
@@ -1299,7 +1302,7 @@ def pdb(input_set, mode, impute):
|
|
| 1299 |
swiss = None
|
| 1300 |
modbase = None
|
| 1301 |
rest = None
|
| 1302 |
-
|
| 1303 |
print('Generating FreeSASA files...')
|
| 1304 |
print('------------------------------------\n')
|
| 1305 |
# Folder for the calculated RSA values.
|
|
|
|
| 172 |
print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
|
| 173 |
% (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
|
| 174 |
len(uniprot_matched.drop_duplicates(['datapoint']))))
|
| 175 |
+
st.write('Checkpoint1')
|
| 176 |
|
| 177 |
"""
|
| 178 |
STEP 5
|
|
|
|
| 263 |
cnt +=1
|
| 264 |
print()
|
| 265 |
print('PDB file processing finished..')
|
| 266 |
+
st.write('Checkpoint2')
|
| 267 |
for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
|
| 268 |
try:
|
| 269 |
filename_replace_ext = filename.with_suffix(".pdb")
|
|
|
|
| 327 |
# If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
|
| 328 |
# If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
|
| 329 |
# If the query data points are found in with_pdb data frame, it will be searched in the following steps.
|
| 330 |
+
st.write('Checkpoint3')
|
| 331 |
"""
|
| 332 |
STEP 6
|
| 333 |
Retrieve sequence annotations.
|
|
|
|
| 368 |
with_pdb.replace({'[]': 'nan'}, inplace=True)
|
| 369 |
with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
|
| 370 |
with_pdb.replace({'': 'nan'}, inplace=True)
|
| 371 |
+
st.write('Checkpoint4')
|
| 372 |
"""
|
| 373 |
STEP 7
|
| 374 |
Do alignment for PDB
|
|
|
|
| 450 |
(after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
|
| 451 |
no_pdb = no_pdb.copy()
|
| 452 |
|
| 453 |
+
st.write('Checkpoint5')
|
| 454 |
print('PDB matching is completed...\n')
|
| 455 |
print('SUMMARY')
|
| 456 |
print('-------')
|
|
|
|
| 545 |
swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
|
| 546 |
print('Index File Processed...\n')
|
| 547 |
|
| 548 |
+
st.write('Checkpoint6')
|
| 549 |
# Get relevant columns
|
| 550 |
swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
|
| 551 |
# Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
|
|
|
|
| 625 |
swiss_model = None
|
| 626 |
no_swiss_models = None
|
| 627 |
url_nan = None
|
| 628 |
+
st.write('Checkpoint7')
|
| 629 |
# At this point we have:
|
| 630 |
# pdb_aligned --- Align in the PDB phase
|
| 631 |
# not_match_in_uniprot --- This won't be aligned with anything because these proteins don't have a UniProt ID. Only basic info is present.
|
|
|
|
| 715 |
k = pd.Series(swiss_models_with_data.iloc[i])
|
| 716 |
broken_swiss = broken_swiss.append(k, ignore_index=True)
|
| 717 |
c += 1
|
| 718 |
+
st.write('Checkpoint7')
|
| 719 |
if len(broken_swiss) == 0:
|
| 720 |
broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
|
| 721 |
|
|
|
|
| 823 |
not_nan = None
|
| 824 |
which_ones_are_match = None
|
| 825 |
swiss_not_match = None
|
| 826 |
+
st.write('Checkpoint8')
|
| 827 |
# STEP : GO TO MODBASE
|
| 828 |
# Should not include anything related to prev models.
|
| 829 |
if len(to_modbase) != 0:
|
|
|
|
| 845 |
|
| 846 |
modbase_reduced = pd.DataFrame()
|
| 847 |
modbase_fasta = pd.DataFrame()
|
| 848 |
+
st.write('Checkpoint9')
|
| 849 |
print('Retrieving ModBase models...\n')
|
| 850 |
# Get model files associated with each UniProtID
|
| 851 |
for protein in list(set(to_modbase.uniprotID.to_list())):
|
|
|
|
| 921 |
quality_score = -999
|
| 922 |
|
| 923 |
print()
|
| 924 |
+
st.write('Checkpoint10')
|
| 925 |
if len(modbase_fasta) != 0:
|
| 926 |
modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
|
| 927 |
else:
|
|
|
|
| 1230 |
rest.drop(['index'], axis=1, inplace=True)
|
| 1231 |
rest = rest.astype('str')
|
| 1232 |
to_modbase_size = 0
|
| 1233 |
+
st.write('Checkpoint11')
|
| 1234 |
print('Modbase matching is completed...\n')
|
| 1235 |
print('SUMMARY')
|
| 1236 |
print('-------')
|
|
|
|
| 1302 |
swiss = None
|
| 1303 |
modbase = None
|
| 1304 |
rest = None
|
| 1305 |
+
st.write('Checkpoint12')
|
| 1306 |
print('Generating FreeSASA files...')
|
| 1307 |
print('------------------------------------\n')
|
| 1308 |
# Folder for the calculated RSA values.
|