Spaces:

HUBioDataLab
/

ASCARIS

Sleeping

App Files Files Community

fatmacankara committed on Jul 24, 2023

Commit

7c5bea8

1 Parent(s): 80cfda5

Update code/pdb_featureVector.py

Browse files

Files changed (1) hide show

code/pdb_featureVector.py +13 -10

code/pdb_featureVector.py CHANGED Viewed

@@ -172,6 +172,7 @@ def pdb(input_set, mode, impute):
         print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
               % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
                  len(uniprot_matched.drop_duplicates(['datapoint']))))
         """
         STEP 5
@@ -262,6 +263,7 @@ def pdb(input_set, mode, impute):
             cnt +=1
         print()
         print('PDB file processing finished..')
         for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
             try:
                 filename_replace_ext = filename.with_suffix(".pdb")
@@ -325,7 +327,7 @@ def pdb(input_set, mode, impute):
         # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
         # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
         # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
         """
         STEP 6
         Retrieve sequence annotations.
@@ -366,7 +368,7 @@ def pdb(input_set, mode, impute):
         with_pdb.replace({'[]': 'nan'}, inplace=True)
         with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
         with_pdb.replace({'': 'nan'}, inplace=True)
         """
         STEP 7
         Do alignment for PDB
@@ -448,7 +450,7 @@ def pdb(input_set, mode, impute):
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
         print('PDB matching is completed...\n')
         print('SUMMARY')
         print('-------')
@@ -543,7 +545,7 @@ def pdb(input_set, mode, impute):
         swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
         print('Index File Processed...\n')
         # Get relevant columns
         swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
         # Sort models on qmean score and identity. Some proteins have more than one models, we will pick one.
@@ -623,7 +625,7 @@ def pdb(input_set, mode, impute):
         swiss_model = None
         no_swiss_models = None
         url_nan = None
         # At this point we have:
         # pdb_aligned --- Align in the PDB phase
         # not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present.
@@ -713,7 +715,7 @@ def pdb(input_set, mode, impute):
                 k = pd.Series(swiss_models_with_data.iloc[i])
                 broken_swiss = broken_swiss.append(k, ignore_index=True)
                 c += 1
         if len(broken_swiss) == 0:
             broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
@@ -821,7 +823,7 @@ def pdb(input_set, mode, impute):
         not_nan = None
         which_ones_are_match = None
         swiss_not_match = None
         # STEP :  GO TO MODBASE
         # Should not include anything related to prev models.
         if len(to_modbase) != 0:
@@ -843,7 +845,7 @@ def pdb(input_set, mode, impute):
             modbase_reduced = pd.DataFrame()
             modbase_fasta = pd.DataFrame()
             print('Retrieving ModBase models...\n')
             # Get model files associated with each UniProtID
             for protein in list(set(to_modbase.uniprotID.to_list())):
@@ -919,6 +921,7 @@ def pdb(input_set, mode, impute):
                                 quality_score = -999
             print()
             if len(modbase_fasta) != 0:
                 modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
             else:
@@ -1227,7 +1230,7 @@ def pdb(input_set, mode, impute):
             rest.drop(['index'], axis=1, inplace=True)
             rest = rest.astype('str')
             to_modbase_size = 0
         print('Modbase matching is completed...\n')
         print('SUMMARY')
         print('-------')
@@ -1299,7 +1302,7 @@ def pdb(input_set, mode, impute):
         swiss = None
         modbase = None
         rest = None
         print('Generating FreeSASA files...')
         print('------------------------------------\n')
         # Folder to calculated RSA values.

         print('You have %d data points that failed to match a UniProt Sequence\nProceeding with %d remaining...\n'
               % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
                  len(uniprot_matched.drop_duplicates(['datapoint']))))
+        st.write('Checkpoint1')
         """
         STEP 5
             cnt +=1
         print()
         print('PDB file processing finished..')
+        st.write('Checkpoint2')
         for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
             try:
                 filename_replace_ext = filename.with_suffix(".pdb")
         # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
         # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
         # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
+        st.write('Checkpoint3')
         """
         STEP 6
         Retrieve sequence annotations.
         with_pdb.replace({'[]': 'nan'}, inplace=True)
         with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
         with_pdb.replace({'': 'nan'}, inplace=True)
+        st.write('Checkpoint4')
         """
         STEP 7
         Do alignment for PDB
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
+        st.write('Checkpoint5')
         print('PDB matching is completed...\n')
         print('SUMMARY')
         print('-------')
         swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
         print('Index File Processed...\n')
+        st.write('Checkpoint6')
         # Get relevant columns
         swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
         # Sort models on qmean score and identity. Some proteins have more than one models, we will pick one.
         swiss_model = None
         no_swiss_models = None
         url_nan = None
+        st.write('Checkpoint7')
         # At this point we have:
         # pdb_aligned --- Align in the PDB phase
         # not_match_in_uniprot --- This wont be aligned with anything because these proteins dont have a uniprot ID. Only basic info is present.
                 k = pd.Series(swiss_models_with_data.iloc[i])
                 broken_swiss = broken_swiss.append(k, ignore_index=True)
                 c += 1
+        st.write('Checkpoint7')
         if len(broken_swiss) == 0:
             broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
         not_nan = None
         which_ones_are_match = None
         swiss_not_match = None
+        st.write('Checkpoint8')
         # STEP :  GO TO MODBASE
         # Should not include anything related to prev models.
         if len(to_modbase) != 0:
             modbase_reduced = pd.DataFrame()
             modbase_fasta = pd.DataFrame()
+            st.write('Checkpoint9')
             print('Retrieving ModBase models...\n')
             # Get model files associated with each UniProtID
             for protein in list(set(to_modbase.uniprotID.to_list())):
                                 quality_score = -999
             print()
+            st.write('Checkpoint10')
             if len(modbase_fasta) != 0:
                 modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
             else:
             rest.drop(['index'], axis=1, inplace=True)
             rest = rest.astype('str')
             to_modbase_size = 0
+        st.write('Checkpoint11')
         print('Modbase matching is completed...\n')
         print('SUMMARY')
         print('-------')
         swiss = None
         modbase = None
         rest = None
+        st.write('Checkpoint12')
         print('Generating FreeSASA files...')
         print('------------------------------------\n')
         # Folder to calculated RSA values.