Spaces:

HUBioDataLab
/

ASCARIS

Sleeping

App Files Files Community

fatmacankara committed on Aug 24, 2023

Commit

f4dc3e4

1 Parent(s): 9e94583

Update code/pdb_featureVector.py

Browse files

Files changed (1) hide show

code/pdb_featureVector.py +208 -202

code/pdb_featureVector.py CHANGED Viewed

@@ -1,4 +1,3 @@
 # IMPORT NECESSARY MODULES AND LIBRARIES
 from timeit import default_timer as timer
 import xml.etree.ElementTree as ET
@@ -26,13 +25,13 @@ from Bio.PDB import PDBList
 from Bio import Align
 from Bio import SeqIO
 from Bio.PDB import *
 warnings.filterwarnings("ignore")
 start = timer()
 import streamlit as st
 # FUNCTIONS
 # FUNCTIONS
 from calc_pc_property import *
 from add_domains import *
@@ -58,14 +57,16 @@ def pdb(input_set, mode, impute):
     Add datapoint identifier and remove non-standard input.
     """
     data = clean_data(input_set)
-    path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer =  manage_files(mode)
     out_path = path_to_output_files / 'log.txt'
     sys.stdout = open(out_path, 'w')
     print('Creating directories...')
     annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                        'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
-                       'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite', 'region',
                        'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
                        'transitPeptide', 'glycosylation', 'propeptide']
@@ -140,12 +141,14 @@ def pdb(input_set, mode, impute):
                 if wt == can:
                     data.at[i, 'wt_sequence_match'] = 'm'
                 elif wt != can:
-                    isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
                     for k in isoList:
                         if len(k) >= int(data.at[i, 'pos']):
                             resInIso = k[int(int(data.at[i, 'pos']) - 1)]
                             if wt == resInIso:
-                                whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
                                 data.at[i, 'wt_sequence_match'] = 'i'
                                 data.at[i, 'whichIsoform'] = whichIsoform
                                 break
@@ -190,24 +193,16 @@ def pdb(input_set, mode, impute):
         for prot in protein:
             pdbs.append(get_pdb_ids(prot))
         print('PDBs', pdbs)
-        if len(pdbs)>=1:
             print('pdbs not empty')
             pdbs = [item for sublist in pdbs for item in sublist]
             print('NEW', pdbs)
         else:
             print('pdbs empty')
-            pdbs =[]
         print('Processing PDB structures...\n')
         if pdbs == []:
             print('No PDB structure found for the query. ')
-        """
-        try:
-            pdbs = [j.strip('[').strip(']').strip().strip('\'').strip('\"') for j in
-                    ((',').join([str(item) for item in pdbs])).split(',')]
-        except IndexError:
-            pdbs = []
-            print('No PDB structure found for the query. ')
-        """
         print('Starting PDB structures download...\n')
         pdbs = list(filter(None, pdbs))
         pdbs = (set(pdbs))
@@ -219,59 +214,70 @@ def pdb(input_set, mode, impute):
         try:
             shutil.rmtree('obsolete')
         except OSError as e:
-            pass
-        existing_pdb = list(Path(path_to_output_files/'pdb_structures').glob("*"))
-        st.write('existing_pdb')
-        st.write(existing_pdb)
-        existing_pdb = [str(i) for i in existing_pdb]
-        existing_pdb = [i.split('/')[-1].split('.')[0].lower() for i in existing_pdb]
         cnt = 0
         st.write('this is the pdbs', pdbs)
         for search in pdbs:
-            st.write('searching for pdb:', search)
             try:
-                if search.lower() not in existing_pdb:
-                    path_pdb = 'out_files/pdb/pdb_structures'
-                    st.write('path for pdb: ',path_pdb)
-                    file = pdbl.retrieve_pdb_file(search, pdir=path_pdb, file_format="pdb")
-                    st.write('file: ',file)
-                    existing_pdb =  list(Path(path_to_output_files/'pdb_structures').glob("*"))
-                    st.write('after download:', existing_pdb)
-                    st.write(Path(path_to_output_files/'pdb_structures') == path_pdb)
-                    existing_pdb = list(path_pdb.glob("*"))
-                    st.write('after download:', existing_pdb)
-                else:
-                    print('PDB structure file exists..')
-                    for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
-                        filename_replace_ext = filename.with_suffix(".pdb")
-                        filename.rename(filename_replace_ext)
-                    file = Path(path_to_output_files / 'pdb_structures' / f'{search}.pdb')
-                    base = os.path.splitext(str(file))[0]
-                    base = '/'.join(base.split('/')[0:-1]) + '/pdb' + base.split('/')[-1]
-                    os.rename(file, base + ".ent")
-                    file = base + '.ent'
-                resolution_method = parser.get_structure(search, file)
-                for record in SeqIO.parse(file, "pdb-seqres"):
-                    if record.dbxrefs[0].split(':')[0] == 'UNP':
-                        pdb_fasta.at[index, 'pdbID'] = record.id.split(':')[0]
-                        pdb_fasta.at[index, 'chain'] = record.id.split(':')[1]
-                        pdb_fasta.at[index, 'pdbSequence'] = str(record.seq)
-                        pdb_info.at[index, 'uniprotID'] = record.dbxrefs[0].split(':')[1]
-                        pdb_info.at[index, 'pdbID'] = record.id.split(':')[0]
-                        pdb_info.at[index, 'chain'] = record.annotations["chain"]
-                        pdb_info.at[index, 'resolution'] = resolution_method.header['resolution']
-                    index += 1
-            except IndexError as a:
-                st.write(a)
-                pdb_info.at[index, 'uniprotID'] = 'nan'
-                pdb_info.at[index, 'pdbID'] = 'nan'
-                pdb_info.at[index, 'chain'] = 'nan'
-                pdb_info.at[index, 'resolution'] = 'nan'
-            cnt +=1
         print()
         print('PDB file processing finished..')
         for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
             try:
@@ -324,13 +330,11 @@ def pdb(input_set, mode, impute):
                     TypeError
                     with_pdb.at[i, 'pdbInfo'] = 'nan'
-        with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
                              'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
                              'wt_sequence_match',
                              'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
         # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
         # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
         # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
@@ -344,7 +348,8 @@ def pdb(input_set, mode, impute):
         if len(with_pdb) > 0:
             with_pdb = add_annotations(with_pdb)
         else:
-            new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
                                                      'activeSite',
                                                      'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
                                                      'crosslink', 'mutagenesis', 'strand',
@@ -363,7 +368,7 @@ def pdb(input_set, mode, impute):
                                                      'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                                                      'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                                                      'glycosylationBinary', 'propeptideBinary']
-            with_pdb = pd.DataFrame(columns = new_cols)
         try:
             with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
         except:
@@ -375,7 +380,7 @@ def pdb(input_set, mode, impute):
         with_pdb.replace({'[]': 'nan'}, inplace=True)
         with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
         with_pdb.replace({'': 'nan'}, inplace=True)
         """
         STEP 7
         Do alignment for PDB
@@ -407,11 +412,11 @@ def pdb(input_set, mode, impute):
         pdb_fasta = None
         pdb_info = None
         pdbs = None
-        existing_pdb = None
         with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
         with_pdb = None
         print('Aligning sequences...\n')
         aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
@@ -434,7 +439,6 @@ def pdb(input_set, mode, impute):
         aligned_m = aligned_m.astype(str)
         aligned_nm = aligned_nm.astype(str)
         frames = [aligned_m, aligned_nm]
         after_up_pdb_alignment = pd.concat(frames, sort=False)
         if len(after_up_pdb_alignment) == 0:
@@ -457,7 +461,6 @@ def pdb(input_set, mode, impute):
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
         print('PDB matching is completed...\n')
         print('SUMMARY')
         print('-------')
@@ -472,7 +475,6 @@ def pdb(input_set, mode, impute):
         print('--%d will be searched in Swiss-Model database.\n' % (
                 len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
         dfM = None
         dfNM = None
         aligned_nm = None
@@ -528,7 +530,8 @@ def pdb(input_set, mode, impute):
             swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
                                       dtype=str, header=None, skiprows=1,
                                       names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
-                                             'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm','seqid', 'url'])
         else:
             swiss_model = pd.DataFrame(
@@ -548,13 +551,13 @@ def pdb(input_set, mode, impute):
                 swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
             else:
                 swiss_model.at[ind, 'whichIsoform'] = 'nan'
-#        swiss_model.drop(['input'], axis=1, inplace=True)
         swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
         print('Index File Processed...\n')
         # Get relevant columns
-        swiss_model = swiss_model[['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
         # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
         swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
         swiss_model.reset_index(inplace=True)
@@ -711,7 +714,6 @@ def pdb(input_set, mode, impute):
                                                                       ascending=[True, False])
         swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
         swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
         swiss_models_with_data.reset_index(inplace=True)
         swiss_models_with_data.drop(['index'], axis=1, inplace=True)
@@ -728,7 +730,6 @@ def pdb(input_set, mode, impute):
         swiss_models_with_data = swiss_models_with_data1.copy()
         swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
         swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
                                                                     axis=0, ascending=[True, True, True, False])
@@ -738,7 +739,8 @@ def pdb(input_set, mode, impute):
                                                                         keep='first')
         swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
         swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
-        len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(broken_swiss.drop_duplicates(['datapoint'])) + len(
             no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
         # This printed data here includes all possible models with different qualities,
         # because we may get a hit in either of them.
@@ -765,10 +767,10 @@ def pdb(input_set, mode, impute):
         swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
         swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
-        swiss_model_aligned = alignment(swiss_models_with_data, annotation_list, path_to_output_files / 'alignment_files')
         swiss_models_with_data = None
         if len(swiss_model_aligned) == 0:
             swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
             swiss_model_aligned['qmean_norm'] = 'nan'
@@ -861,7 +863,7 @@ def pdb(input_set, mode, impute):
                     url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
                     print(url)
                     req = requests.get(url)
-                    name = path_to_output_files / 'modbase_structures' /  f'{protein}.txt'
                     with open(name, 'wb') as f:
                         f.write(req.content)
                 else:
@@ -878,7 +880,7 @@ def pdb(input_set, mode, impute):
                                 individual.write(str('UniProt ID: ' + protein))
                                 individual.write('\n')
                                 individual.write(str(pdb.contents[3])[10:-11].strip())
-                        with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt',
                                   encoding="utf8") as f:
                             fasta = ''
                             chain = ''
@@ -961,7 +963,6 @@ def pdb(input_set, mode, impute):
             existing_modbase_models = None
             existing_modbase_models_ind = None
             model_info_added = model_info_added.drop(['UniprotID'], axis=1)
             model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
                                                                 'PDBCode': 'template', 'PDBChain': 'chain',
@@ -1014,7 +1015,8 @@ def pdb(input_set, mode, impute):
             with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
                                                               axis=0,
                                                               ascending=[True, True, True, True, False, True, False])
-            with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'], keep='first')
             with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
             with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
@@ -1028,7 +1030,6 @@ def pdb(input_set, mode, impute):
             with_modbase_info.reset_index(inplace=True)
             with_modbase_info.drop('index', axis=1, inplace=True)
             align = with_modbase_info[
                 with_modbase_info.fasta != 'nan']
             yes_pdb_no_match = with_modbase_info[
@@ -1047,7 +1048,6 @@ def pdb(input_set, mode, impute):
             modbase_aligned = modbase_aligned.astype(str)
             modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
             # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at all by this point).
             if len(with_modbase_info) != 0:
                 not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
@@ -1055,29 +1055,30 @@ def pdb(input_set, mode, impute):
                     ['datapoint'],
                     keep=False)
             else:
-                not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                                       'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                                       'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
-                                                       'intMet',
-                                                       'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
-                                                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
-                                                       'crosslink',
-                                                       'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                       'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                       'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
-                                                       'coiledCoil',
-                                                       'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
-                                                       'disulfide',
-                                                       'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
-                                                       'activeSite',
-                                                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
-                                                       'crosslink',
-                                                       'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                       'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                       'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
-                                                       'coiledCoil',
-                                                       'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
-                                                       'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
             with_modbase_info = None
             if len(not_in_aligned) != 0:
                 not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
@@ -1094,7 +1095,8 @@ def pdb(input_set, mode, impute):
                 nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
                 not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
                 not_nan.score = not_nan.score.astype(float)
-                not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False], inplace=True)
                 not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
                                               ascending=[True, True, False])
@@ -1106,7 +1108,7 @@ def pdb(input_set, mode, impute):
             which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
             if len(which_ones_are_match) == 0:
                 which_ones_are_match = pd.DataFrame(
-                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
                              'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                              'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
                              'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
@@ -1142,7 +1144,6 @@ def pdb(input_set, mode, impute):
             not_nan = None
             nan = None
             # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
             # No model
@@ -1171,9 +1172,10 @@ def pdb(input_set, mode, impute):
             elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
                 rest = no_info
             else:
-                rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                             'wt_sequence_match', 'whichIsoform', 'datapoint'])
             rest = rest[to_swiss_columns]
             rest = rest.drop_duplicates()
@@ -1185,49 +1187,53 @@ def pdb(input_set, mode, impute):
         else:
-            modbase_match = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                                  'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                                  'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
-                                                  'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
-                                                  'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
-                                                  'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                  'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                  'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
-                                                  'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
-                                                  'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
-                                                  'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
-                                                  'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
-                                                  'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
-                                                  'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
-                                                  'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
-                                                  'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
-                                                  'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
-                                                  'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
-                                                  'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
-                                                  'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
-                                                  'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
-            not_in_aligned = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
-                                                   'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                                   'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
-                                                   'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
-                                                   'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
-                                                   'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                   'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                   'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
-                                                   'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
-                                                   'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
-                                                   'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
-                                                   'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                                   'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                                   'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
-                                                   'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
-                                                   'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
-            no_info = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                            'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                            'wt_sequence_match', 'whichIsoform', 'datapoint'])
-            rest = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
-                                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                         'wt_sequence_match', 'whichIsoform', 'datapoint'])
             rest = rest[to_swiss_columns]
             rest = rest.drop_duplicates()
@@ -1263,7 +1269,6 @@ def pdb(input_set, mode, impute):
         not_models = None
         modbase_not_match = None
         # Final corrections
         # Now 3D alignment.
@@ -1285,7 +1290,6 @@ def pdb(input_set, mode, impute):
         # Fix the axes and  merge all data.
         pdb.drop(['pdbInfo'], axis=1, inplace=True)
         pdb.rename(columns={'resolution': 'score'}, inplace=True)
         swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
@@ -1298,7 +1302,6 @@ def pdb(input_set, mode, impute):
         modbase['source'] = 'MODBASE'
         data = pd.concat([swiss, modbase, pdb])
         data.reset_index(inplace=True)
         data.drop(['index'], axis=1, inplace=True)
         data = data.astype('str')
@@ -1322,10 +1325,10 @@ def pdb(input_set, mode, impute):
         for pdbID in pdb_only.pdbID.to_list():
             if pdbID not in existing_free_sasa:
                 (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
-                              Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
                               outdir=None, force_rerun=False, file_type='pdb'))
         print('Calculation RSA for SwissModel Files...\n')
         swiss_only = data[data.source == 'SWISSMODEL']
         swiss_dp = []
@@ -1343,7 +1346,8 @@ def pdb(input_set, mode, impute):
         for pdbID in modbase_only.pdbID.to_list():
             if pdbID not in existing_free_sasa:
                 (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
-                              Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'), include_hetatms=True,
                               outdir=None, force_rerun=False, file_type='pdb'))
         # This annotation list is different than the prev one, keep it.
@@ -1381,16 +1385,18 @@ def pdb(input_set, mode, impute):
             chain = data.at[i, 'chain']
             uniprotID = data.at[i, 'uniprotID']
             pdbID = data.at[i, 'pdbID']
-            alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode, Path(path_to_output_files / '3D_alignment'), file_format = 'gzip')
             mutPos = data.at[i, 'mutationPositionOnPDB']
             try:
-                coordMut = get_coords(mutPos, alignments , 'nan', 'nan', mode)[0]
             except:
                 ValueError
                 coordMut = 'nan'
             try:
                 sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
-                data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos, data.at[i, 'wt'], mode, path_to_output_files,file_type = 'pdb')
             except:
                 ValueError
                 data.at[i, 'sasa'] = 'nan'  # mutation position is nan
@@ -1438,11 +1444,9 @@ def pdb(input_set, mode, impute):
             data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
                                                  float(data.at[i, 'domainEndonPDB']))
         data = data.astype(str)
         data.replace({'NaN': 'nan'}, inplace=True)
         # Now unify all 3 separate data sets. We have with_pdb: the ones that have PDB structures, swiss, modbase, the ones that didn't match with any, and the ones that didn't have a wt seq match.
         # Get interface positions from ECLAIR. Download HQ human
@@ -1463,28 +1467,29 @@ def pdb(input_set, mode, impute):
         interface_dataframe.columns = ['uniprotID', 'positions']
         if len(data) == 0:
-            data = pd.DataFrame(columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume','granthamScore',
-                                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
-                                         'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
-                                         'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
-                                         'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
-                                         'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
-                                         'strand', 'helix', 'turn', 'metalBinding', 'repeat',
-                                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
-                                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
-                                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
-                                         'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
-                                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
-                                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
-                                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
-                                         'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
-                                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
-                                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
-                                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
-                                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
-                                         'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
-                                         'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
-                                         'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
         else:
             data.sasa = data.sasa.astype('str')
@@ -1523,7 +1528,6 @@ def pdb(input_set, mode, impute):
         data.drop(['positions'], axis=1, inplace=True)
         # OPTIONAL
         # DOMAIN SELECTION
         # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
@@ -1542,7 +1546,8 @@ def pdb(input_set, mode, impute):
         # nan--> 0, 0 -->1 and 1 -->2
         print('Final adjustments are being done...\n')
-        binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary',
                       'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                       'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                       'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
@@ -1644,7 +1649,8 @@ def pdb(input_set, mode, impute):
         ready = data.copy()
         # Imputation
         if (impute == 'True') or (impute == 'true') or (impute == True):
-            filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99, 16.82,
                       20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
             col_index = 0
             for col_ in ready.columns[-30:]:
@@ -1659,7 +1665,8 @@ def pdb(input_set, mode, impute):
         ready = ready.replace({'nan': np.NaN})
         ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
         if len(ready) == 0:
-            print('No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
         print(ready)
         print('Feature vector successfully created...')
         return ready
@@ -1669,5 +1676,4 @@ def pdb(input_set, mode, impute):
     minutes, seconds = divmod(rem, 60)
     print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
     sys.stdout.close()
-    return ready

 # IMPORT NECESSARY MODULES AND LIBRARIES
 from timeit import default_timer as timer
 import xml.etree.ElementTree as ET
 from Bio import Align
 from Bio import SeqIO
 from Bio.PDB import *
 warnings.filterwarnings("ignore")
 start = timer()
 import streamlit as st
 # FUNCTIONS
 # FUNCTIONS
 from calc_pc_property import *
 from add_domains import *
     Add datapoint identifier and remove non-standard input.
     """
     data = clean_data(input_set)
+    path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(
+        mode)
     out_path = path_to_output_files / 'log.txt'
     sys.stdout = open(out_path, 'w')
     print('Creating directories...')
     annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                        'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis', 'strand',
+                       'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding', 'bindingSite',
+                       'region',
                        'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
                        'transitPeptide', 'glycosylation', 'propeptide']
                 if wt == can:
                     data.at[i, 'wt_sequence_match'] = 'm'
                 elif wt != can:
+                    isoList = isoform_fasta[
+                        isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
                     for k in isoList:
                         if len(k) >= int(data.at[i, 'pos']):
                             resInIso = k[int(int(data.at[i, 'pos']) - 1)]
                             if wt == resInIso:
+                                whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[
+                                    0]
                                 data.at[i, 'wt_sequence_match'] = 'i'
                                 data.at[i, 'whichIsoform'] = whichIsoform
                                 break
         for prot in protein:
             pdbs.append(get_pdb_ids(prot))
         print('PDBs', pdbs)
+        if len(pdbs) >= 1:
             print('pdbs not empty')
             pdbs = [item for sublist in pdbs for item in sublist]
             print('NEW', pdbs)
         else:
             print('pdbs empty')
+            pdbs = []
         print('Processing PDB structures...\n')
         if pdbs == []:
             print('No PDB structure found for the query. ')
         print('Starting PDB structures download...\n')
         pdbs = list(filter(None, pdbs))
         pdbs = (set(pdbs))
         try:
             shutil.rmtree('obsolete')
         except OSError as e:
+            pass
         cnt = 0
         st.write('this is the pdbs', pdbs)
+        def fetch_uniprot_ids(pdb_code):
+            try:
+                response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}")
+                response.raise_for_status()  # Check for a successful response
+                data = response.json()
+                st.write(list(list(list(data.values())[0].values())[0].keys()))
+                return list(list(list(data.values())[0].values())[0].keys())
+            except :
+                return []
         for search in pdbs:
+            # Step 1: Fetch the PDB file
+            pdb_url = f"https://files.rcsb.org/download/{search}.pdb"
+            st.write(pdb_url)
             try:
+                response = requests.get(pdb_url)
+                st.write('response', response)
+                response.raise_for_status()  # Check for a successful response
+            except :
+                continue  # Skip to the next PDB code if fetching fails
+            st.write('response2', response)
+            # Step 2: Parse the PDB file from memory
+            pdb_data = response.text
+            pdb_parser = PDBParser(QUIET=True)  # QUIET=True suppresses warnings
+            pdb_file_content = StringIO(pdb_data)
+            structure = pdb_parser.get_structure(pdb_code, pdb_file_content)
+            st.write(structure)
+            ppb = PPBuilder()
+            for model in structure:
+                st.write(model)
+                for pp in ppb.build_peptides(model):
+                    sequence = pp.get_sequence()
+                    st.write(sequence)
+                for chain in model:
+                    chain_id = chain.get_id()
+                    # Extract UniProt ID if available in the chain's annotations
+                    uniprot_ids = fetch_uniprot_ids(search)
+                    # Get the resolution from the PDB header
+                    header = structure.header
+                    resolution = header.get('resolution', 'N/A')
+                    # Print UniProt IDs, chain ID, and resolution for the current model
+                    for i, chain in enumerate(model, start=1):
+                        chain_id = chain.get_id()
+                        st.write(f"---- Information for Chain {chain_id} in Model {i} ----")
+                        st.write(f"UniProt IDs: {', '.join(uniprot_ids)}")
+                        st.write(f"Chain ID: {chain_id}")
+                        st.write(f"PDB ID: {search.upper()}")
+                        st.write(f"Resolution: {resolution}")
+                        st.write(f"Sequence: {sequence}")
+                        pdb_fasta.at[index, 'pdbID'] = search
+                        pdb_fasta.at[index, 'chain'] = chain_id
+                        pdb_fasta.at[index, 'pdbSequence'] = str(sequence)
+                        pdb_info.at[index, 'uniprotID'] = ', '.join(uniprot_ids)
+                        pdb_info.at[index, 'pdbID'] = search
+                        pdb_info.at[index, 'chain'] = chain_id
+                        pdb_info.at[index, 'resolution'] = resolution
+                        index += 1
         print()
+        st.write()
+        st.write(pdb_info)
         print('PDB file processing finished..')
         for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
             try:
                     TypeError
                     with_pdb.at[i, 'pdbInfo'] = 'nan'
+        with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                              'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
                              'wt_sequence_match',
                              'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
         # If the query data points are found in no_match_in_uniprot data frame, it will not give any results.
         # If the query data points are found in no_pdb data frame, it will be searched in the modbase and swiss_model steps.
         # If the query data points are found in with_pdb data frame, it will be searched in the following steps.
         if len(with_pdb) > 0:
             with_pdb = add_annotations(with_pdb)
         else:
+            new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
+                                                     'dnaBinding',
                                                      'activeSite',
                                                      'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
                                                      'crosslink', 'mutagenesis', 'strand',
                                                      'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                                                      'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                                                      'glycosylationBinary', 'propeptideBinary']
+            with_pdb = pd.DataFrame(columns=new_cols)
         try:
             with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
         except:
         with_pdb.replace({'[]': 'nan'}, inplace=True)
         with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
         with_pdb.replace({'': 'nan'}, inplace=True)
         """
         STEP 7
         Do alignment for PDB
         pdb_fasta = None
         pdb_info = None
         pdbs = None
+        g_pdb = None
         with_pdb_size = len(with_pdb.drop_duplicates(['datapoint']))
         with_pdb = None
         print('Aligning sequences...\n')
         aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files'))
         aligned_m = aligned_m.astype(str)
         aligned_nm = aligned_nm.astype(str)
         frames = [aligned_m, aligned_nm]
         after_up_pdb_alignment = pd.concat(frames, sort=False)
         if len(after_up_pdb_alignment) == 0:
             (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
         no_pdb = no_pdb.copy()
         print('PDB matching is completed...\n')
         print('SUMMARY')
         print('-------')
         print('--%d will be searched in Swiss-Model database.\n' % (
                 len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
         dfM = None
         dfNM = None
         aligned_nm = None
             swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
                                       dtype=str, header=None, skiprows=1,
                                       names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
+                                             'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
+                                             'qmean_norm', 'seqid', 'url'])
         else:
             swiss_model = pd.DataFrame(
                 swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
             else:
                 swiss_model.at[ind, 'whichIsoform'] = 'nan'
+        #        swiss_model.drop(['input'], axis=1, inplace=True)
         swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
         print('Index File Processed...\n')
         # Get relevant columns
+        swiss_model = swiss_model[
+            ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
         # Sort models on qmean score and identity. Some proteins have more than one model; we will pick one.
         swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
         swiss_model.reset_index(inplace=True)
                                                                       ascending=[True, False])
         swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
         swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
         swiss_models_with_data.reset_index(inplace=True)
         swiss_models_with_data.drop(['index'], axis=1, inplace=True)
         swiss_models_with_data = swiss_models_with_data1.copy()
         swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
         swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
                                                                     axis=0, ascending=[True, True, True, False])
                                                                         keep='first')
         swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
         swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
+        len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
+            broken_swiss.drop_duplicates(['datapoint'])) + len(
             no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
         # This printed data here includes all possible models with different qualities,
         # because we may get a hit in either of them.
         swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
         swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
+        swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
+                                        path_to_output_files / 'alignment_files')
         swiss_models_with_data = None
         if len(swiss_model_aligned) == 0:
             swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns)
             swiss_model_aligned['qmean_norm'] = 'nan'
                     url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
                     print(url)
                     req = requests.get(url)
+                    name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
                     with open(name, 'wb') as f:
                         f.write(req.content)
                 else:
                                 individual.write(str('UniProt ID: ' + protein))
                                 individual.write('\n')
                                 individual.write(str(pdb.contents[3])[10:-11].strip())
+                        with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt',
                                   encoding="utf8") as f:
                             fasta = ''
                             chain = ''
             existing_modbase_models = None
             existing_modbase_models_ind = None
             model_info_added = model_info_added.drop(['UniprotID'], axis=1)
             model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
                                                                 'PDBCode': 'template', 'PDBChain': 'chain',
             with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
                                                               axis=0,
                                                               ascending=[True, True, True, True, False, True, False])
+            with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
+                                                                  keep='first')
             with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
             with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
             with_modbase_info.reset_index(inplace=True)
             with_modbase_info.drop('index', axis=1, inplace=True)
             align = with_modbase_info[
                 with_modbase_info.fasta != 'nan']
             yes_pdb_no_match = with_modbase_info[
             modbase_aligned = modbase_aligned.astype(str)
             modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
             # Get the ones whose models couldn't be found. Add to no_modbase (i.e., nothing has matched at this point).
             if len(with_modbase_info) != 0:
                 not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
                     ['datapoint'],
                     keep=False)
             else:
+                not_in_aligned = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                             'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide',
+                             'intMet',
+                             'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
+                             'crosslink',
+                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
+                             'coiledCoil',
+                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                             'disulfide',
+                             'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
+                             'activeSite',
+                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
+                             'crosslink',
+                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
+                             'coiledCoil',
+                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
+                             'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
             with_modbase_info = None
             if len(not_in_aligned) != 0:
                 not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
                 nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
                 not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
                 not_nan.score = not_nan.score.astype(float)
+                not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
+                                    inplace=True)
                 not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
                                               ascending=[True, True, False])
             which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
             if len(which_ones_are_match) == 0:
                 which_ones_are_match = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                              'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                              'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
                              'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
             not_nan = None
             nan = None
             # merge not_in_align and modbase_not_match as they were both excluded from modbase match.
             # No model
             elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
                 rest = no_info
             else:
+                rest = pd.DataFrame(
+                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                             'wt_sequence_match', 'whichIsoform', 'datapoint'])
             rest = rest[to_swiss_columns]
             rest = rest.drop_duplicates()
         else:
+            modbase_match = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
+                         'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                         'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                         'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                         'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
+                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                         'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                         'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
+                         'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
+                         'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
+            not_in_aligned = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
+                         'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                         'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                         'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
+                         'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
+                         'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
+                         'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
+                         'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
+            no_info = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'wt_sequence_match', 'whichIsoform', 'datapoint'])
+            rest = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'wt_sequence_match', 'whichIsoform', 'datapoint'])
             rest = rest[to_swiss_columns]
             rest = rest.drop_duplicates()
         not_models = None
         modbase_not_match = None
         # Final corrections
         # Now 3D alignment.
         # Fix the axes and  merge all data.
         pdb.drop(['pdbInfo'], axis=1, inplace=True)
         pdb.rename(columns={'resolution': 'score'}, inplace=True)
         swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
         modbase['source'] = 'MODBASE'
         data = pd.concat([swiss, modbase, pdb])
         data.reset_index(inplace=True)
         data.drop(['index'], axis=1, inplace=True)
         data = data.astype('str')
         for pdbID in pdb_only.pdbID.to_list():
             if pdbID not in existing_free_sasa:
                 (run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
+                              Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
+                              include_hetatms=True,
                               outdir=None, force_rerun=False, file_type='pdb'))
         print('Calculation RSA for SwissModel Files...\n')
         swiss_only = data[data.source == 'SWISSMODEL']
         swiss_dp = []
         for pdbID in modbase_only.pdbID.to_list():
             if pdbID not in existing_free_sasa:
                 (run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
+                              Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
+                              include_hetatms=True,
                               outdir=None, force_rerun=False, file_type='pdb'))
         # This annotation list is different than the prev one, keep it.
             chain = data.at[i, 'chain']
             uniprotID = data.at[i, 'uniprotID']
             pdbID = data.at[i, 'pdbID']
+            alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
+                                           Path(path_to_output_files / '3D_alignment'), file_format='gzip')
             mutPos = data.at[i, 'mutationPositionOnPDB']
             try:
+                coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
             except:
                 ValueError
                 coordMut = 'nan'
             try:
                 sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
+                data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
+                                          data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb')
             except:
                 ValueError
                 data.at[i, 'sasa'] = 'nan'  # mutation position is nan
             data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
                                                  float(data.at[i, 'domainEndonPDB']))
         data = data.astype(str)
         data.replace({'NaN': 'nan'}, inplace=True)
         # Now unify all 3 separate data. We have with_pdb. The ones that have pdb structures, swiss, modbase, the ones that didn't match with any and the ones that didn't have a wt seq match.
         # Get interface positions from ECLAIR. Download HQ human
         interface_dataframe.columns = ['uniprotID', 'positions']
         if len(data) == 0:
+            data = pd.DataFrame(
+                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
+                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
+                         'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
+                         'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
+                         'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
+                         'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
+                         'strand', 'helix', 'turn', 'metalBinding', 'repeat',
+                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
+                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
+                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
+                         'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
+                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
+                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
+                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
+                         'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
+                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
+                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
+                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
+                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
+                         'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
+                         'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
+                         'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
         else:
             data.sasa = data.sasa.astype('str')
         data.drop(['positions'], axis=1, inplace=True)
         # OPTIONAL
         # DOMAIN SELECTION
         # Next step: Delete all other domains with 'NULL.' R is capable of handling 53 categories. We will keep 52 most
         # nan--> 0, 0 -->1 and 1 -->2
         print('Final adjustments are being done...\n')
+        binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
+                      'dnaBindingBinary',
                       'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                       'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                       'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
         ready = data.copy()
         # Imputation
         if (impute == 'True') or (impute == 'true') or (impute == True):
+            filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
+                      16.82,
                       20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
             col_index = 0
             for col_ in ready.columns[-30:]:
         ready = ready.replace({'nan': np.NaN})
         ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
         if len(ready) == 0:
+            print(
+                'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
         print(ready)
         print('Feature vector successfully created...')
         return ready
     minutes, seconds = divmod(rem, 60)
     print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
     sys.stdout.close()
+    return ready