Spaces: Running on Zero
Enhance data processing in app.py and openalex_utils.py by improving handling of referenced works and filling missing publication values with spaces.
Browse files
- app.py +2 -1
- openalex_utils.py +4 -1
app.py
CHANGED
|
@@ -524,7 +524,8 @@ def predict(request: gr.Request, text_input, sample_size_slider, reduce_sample_c
|
|
| 524 |
# Export relevant column
|
| 525 |
export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
|
| 526 |
export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
|
| 527 |
-
export_df['referenced_works'] = [', '.join(x) for x in records_df['referenced_works']]
|
|
|
|
| 528 |
if locally_approximate_publication_date_checkbox and plot_time_checkbox:
|
| 529 |
export_df['approximate_publication_year'] = local_years
|
| 530 |
export_df.to_csv(csv_file_path, index=False)
|
|
|
|
| 524 |
# Export relevant column
|
| 525 |
export_df = records_df[['title', 'abstract', 'doi', 'publication_year', 'x', 'y','id','primary_topic']]
|
| 526 |
export_df['parsed_field'] = [get_field(row) for ix, row in export_df.iterrows()]
|
| 527 |
+
export_df['referenced_works'] = [x if isinstance(x, str) else ', '.join(x) if isinstance(x, (list, tuple)) and not pd.isna(x) else '' for x in records_df['referenced_works']]
|
| 528 |
+
|
| 529 |
if locally_approximate_publication_date_checkbox and plot_time_checkbox:
|
| 530 |
export_df['approximate_publication_year'] = local_years
|
| 531 |
export_df.to_csv(csv_file_path, index=False)
|
openalex_utils.py
CHANGED
|
@@ -99,14 +99,17 @@ def process_records_to_df(records):
|
|
| 99 |
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
|
| 100 |
if 'primary_location' in records_df.columns:
|
| 101 |
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
|
|
|
|
|
|
|
| 102 |
else:
|
| 103 |
# Process raw records as before
|
| 104 |
records_df = pd.DataFrame(records)
|
| 105 |
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
|
| 106 |
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
|
|
|
|
| 107 |
|
| 108 |
# Fill missing values and deduplicate
|
| 109 |
-
|
| 110 |
records_df['abstract'] = records_df['abstract'].fillna(' ')
|
| 111 |
records_df['title'] = records_df['title'].fillna(' ')
|
| 112 |
records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)
|
|
|
|
| 99 |
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
|
| 100 |
if 'primary_location' in records_df.columns:
|
| 101 |
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
|
| 102 |
+
records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ') # fill missing values with space, only if we have them.
|
| 103 |
+
|
| 104 |
else:
|
| 105 |
# Process raw records as before
|
| 106 |
records_df = pd.DataFrame(records)
|
| 107 |
records_df['abstract'] = [invert_abstract(t) for t in records_df['abstract_inverted_index']]
|
| 108 |
records_df['parsed_publication'] = [get_pub(x) for x in records_df['primary_location']]
|
| 109 |
+
records_df['parsed_publication'] = records_df['parsed_publication'].fillna(' ')
|
| 110 |
|
| 111 |
# Fill missing values and deduplicate
|
| 112 |
+
|
| 113 |
records_df['abstract'] = records_df['abstract'].fillna(' ')
|
| 114 |
records_df['title'] = records_df['title'].fillna(' ')
|
| 115 |
records_df = records_df.drop_duplicates(subset=['id']).reset_index(drop=True)
|