Richard Guo
committed on
Commit
·
19a3899
1
Parent(s):
81aaa4e
limit datum upload to 30k
Browse files- build_map.py +13 -3
build_map.py
CHANGED
|
@@ -115,7 +115,8 @@ def upload_dataset_to_atlas(dataset_dict,
|
|
| 115 |
indexed_field = None,
|
| 116 |
modality=None,
|
| 117 |
organization_name=None,
|
| 118 |
-
wait_for_map=True
|
|
|
|
| 119 |
|
| 120 |
if modality is None:
|
| 121 |
modality = "text"
|
|
@@ -124,7 +125,7 @@ def upload_dataset_to_atlas(dataset_dict,
|
|
| 124 |
unique_id_field_name = "atlas_datum_id"
|
| 125 |
|
| 126 |
if project_name is None:
|
| 127 |
-
project_name = dataset_dict["name"].replace("/", "--")
|
| 128 |
|
| 129 |
desc = f"Config: {dataset_dict['config']}"
|
| 130 |
|
|
@@ -169,13 +170,22 @@ def upload_dataset_to_atlas(dataset_dict,
|
|
| 169 |
batch_size = 1000
|
| 170 |
batched_texts = []
|
| 171 |
|
|
|
|
|
|
|
| 172 |
for split in dataset_dict["splits"]:
|
| 173 |
|
|
|
|
|
|
|
|
|
|
| 174 |
dataset = load_dataset(dataset_dict["name"], dataset_dict["config"], split = split, streaming=True)
|
| 175 |
|
| 176 |
for i, ex in tqdm(enumerate(dataset)):
|
| 177 |
if i % 10000 == 0:
|
| 178 |
time.sleep(2)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
data_to_add = {"split": split, unique_id_field_name: f"{split}_{i}"}
|
| 181 |
|
|
@@ -245,4 +255,4 @@ if __name__ == "__main__":
|
|
| 245 |
project_name = "huggingface_auto_upload_test-dolly-15k"
|
| 246 |
|
| 247 |
dataset_dict = load_dataset_and_metadata(dataset_name)
|
| 248 |
-
print(upload_dataset_to_atlas(dataset_dict, project_name=project_name))
|
|
|
|
| 115 |
indexed_field = None,
|
| 116 |
modality=None,
|
| 117 |
organization_name=None,
|
| 118 |
+
wait_for_map=True,
|
| 119 |
+
datum_limit=30000):
|
| 120 |
|
| 121 |
if modality is None:
|
| 122 |
modality = "text"
|
|
|
|
| 125 |
unique_id_field_name = "atlas_datum_id"
|
| 126 |
|
| 127 |
if project_name is None:
|
| 128 |
+
project_name = dataset_dict["name"].replace("/", "--") + "--hf-atlas-map"
|
| 129 |
|
| 130 |
desc = f"Config: {dataset_dict['config']}"
|
| 131 |
|
|
|
|
| 170 |
batch_size = 1000
|
| 171 |
batched_texts = []
|
| 172 |
|
| 173 |
+
allow_upload = True
|
| 174 |
+
|
| 175 |
for split in dataset_dict["splits"]:
|
| 176 |
|
| 177 |
+
if not allow_upload:
|
| 178 |
+
break
|
| 179 |
+
|
| 180 |
dataset = load_dataset(dataset_dict["name"], dataset_dict["config"], split = split, streaming=True)
|
| 181 |
|
| 182 |
for i, ex in tqdm(enumerate(dataset)):
|
| 183 |
if i % 10000 == 0:
|
| 184 |
time.sleep(2)
|
| 185 |
+
if i == datum_limit:
|
| 186 |
+
print("Datum upload limited to 30,000 points. Stopping upload...")
|
| 187 |
+
allow_upload = False
|
| 188 |
+
break
|
| 189 |
|
| 190 |
data_to_add = {"split": split, unique_id_field_name: f"{split}_{i}"}
|
| 191 |
|
|
|
|
| 255 |
project_name = "huggingface_auto_upload_test-dolly-15k"
|
| 256 |
|
| 257 |
dataset_dict = load_dataset_and_metadata(dataset_name)
|
| 258 |
+
print(upload_dataset_to_atlas(dataset_dict, project_name=project_name))
|