Spaces:

MaxNoichl
/

openalex_mapper

Running on Zero

MaxNoichl committed on Apr 28

Commit

d748d3b

1 Parent(s): 4c3ab20

Enhance data processing in app.py and openalex_utils.py by improving handling of referenced works and filling missing publication values with spaces.

Files changed (2) hide show

app.py CHANGED Viewed

@@ -524,7 +524,8 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
         # Export relevant column
         export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
         export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
-        export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
         if locally_approximate_publication_date_checkbox and plot_time_checkbox:
             export_df['approximate_publication_year'] = local_years
         export_df.to_csv(csv_file_path, index=False)

         # Export relevant column
         export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
         export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
+        export_df['referenced_works'] = [x if isinstance(x, str) else ', '.join(x) if isinstance(x, (list, tuple)) and not pd.isna(x) else '' for x in records_df['referenced_works']]
         if locally_approximate_publication_date_checkbox and plot_time_checkbox:
             export_df['approximate_publication_year'] = local_years
         export_df.to_csv(csv_file_path, index=False)

openalex_utils.py CHANGED Viewed

@@ -99,14 +99,17 @@ def process_records_to_df(records):
             records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
         if 'primary_location' in records_df.columns:
             records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
     else:
         # Process raw records as before
         records_df = pd.DataFrame(records)
         records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
         records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
     # Fill missing values and deduplicate
-    records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
     records_df['abstract'] = records_df['abstract'].fillna(' ')
     records_df['title'] = records_df['title'].fillna(' ')
     records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)

             records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
         if 'primary_location' in records_df.columns:
             records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
+            records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ') # fill missing values with space, only if we have them.
     else:
         # Process raw records as before
         records_df = pd.DataFrame(records)
         records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
         records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
+        records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
     # Fill missing values and deduplicate
     records_df['abstract'] = records_df['abstract'].fillna(' ')
     records_df['title'] = records_df['title'].fillna(' ')
     records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)