Upload folder using huggingface_hub
Browse files
flip_vit_b16_s512m_bs16k/checkpoints/epoch_4.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4ef56b70199aff223bcaf5ded3ad1ff024c6e43ad2c47a47618e755b20dfb2ed
|
| 3 |
+
size 1795823122
|
flip_vit_b16_s512m_bs16k/checkpoints/results.jsonl
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"imagenet-zeroshot-val-top1": 0.34178, "imagenet-zeroshot-val-top5": 0.61662}
|
| 2 |
+
{"imagenet-zeroshot-val-top1": 0.43266, "imagenet-zeroshot-val-top5": 0.7194}
|
| 3 |
+
{"imagenet-zeroshot-val-top1": 0.4736, "imagenet-zeroshot-val-top5": 0.75818}
|
| 4 |
+
{"imagenet-zeroshot-val-top1": 0.49378, "imagenet-zeroshot-val-top5": 0.77458}
|
| 5 |
+
{"imagenet-zeroshot-val-top1": 0.55004, "imagenet-zeroshot-val-top5": 0.82504}
|
| 6 |
+
{"imagenet-zeroshot-val-top1": 0.5815, "imagenet-zeroshot-val-top5": 0.84318}
|
| 7 |
+
{"imagenet-zeroshot-val-top1": 0.58112, "imagenet-zeroshot-val-top5": 0.84332}
|
flip_vit_b16_s512m_bs16k/out.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
flip_vit_b16_s512m_bs16k/params.txt
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
NDR_patch_size: 16
|
| 2 |
+
accum_freq: 1
|
| 3 |
+
aug_cfg: {}
|
| 4 |
+
batch_size: 2048
|
| 5 |
+
beta1: 0.9
|
| 6 |
+
beta2: 0.98
|
| 7 |
+
checkpoint_path: ./logs-lr1e-3-datacomp/flip_vit_b16_s512m_bs16k/checkpoints
|
| 8 |
+
coca_caption_loss_weight: 2.0
|
| 9 |
+
coca_contrastive_loss_weight: 1.0
|
| 10 |
+
copy_codebase: False
|
| 11 |
+
csv_caption_key: title
|
| 12 |
+
csv_img_key: filepath
|
| 13 |
+
csv_separator:
|
| 14 |
+
dataset_resampled: False
|
| 15 |
+
dataset_type: webdataset
|
| 16 |
+
ddp_static_graph: True
|
| 17 |
+
debug: False
|
| 18 |
+
delete_prev_step_ckpt: True
|
| 19 |
+
delete_previous_checkpoint: False
|
| 20 |
+
device: cuda:0
|
| 21 |
+
dist_backend: nccl
|
| 22 |
+
dist_url: env://
|
| 23 |
+
distill: False
|
| 24 |
+
distill_model: None
|
| 25 |
+
distill_pretrained: None
|
| 26 |
+
distributed: True
|
| 27 |
+
epochs: 4
|
| 28 |
+
epochs_cooldown: None
|
| 29 |
+
eps: 1e-06
|
| 30 |
+
force_custom_text: False
|
| 31 |
+
force_image_size: 224
|
| 32 |
+
force_patch_dropout: None
|
| 33 |
+
force_quick_gelu: False
|
| 34 |
+
gather_with_grad: True
|
| 35 |
+
global_batch_size: 16384
|
| 36 |
+
grad_checkpointing: True
|
| 37 |
+
grad_clip_norm: None
|
| 38 |
+
horovod: False
|
| 39 |
+
image_interpolation: None
|
| 40 |
+
image_mean: None
|
| 41 |
+
image_resize_mode: None
|
| 42 |
+
image_std: None
|
| 43 |
+
imagenet_v2: None
|
| 44 |
+
imagenet_val: /mnt/bn/zilongdata-hl/dataset/imagenet/val
|
| 45 |
+
is_cls_token: False
|
| 46 |
+
local_loss: True
|
| 47 |
+
local_rank: 0
|
| 48 |
+
lock_image: False
|
| 49 |
+
lock_image_freeze_bn_stats: False
|
| 50 |
+
lock_image_unlocked_groups: 0
|
| 51 |
+
lock_text: False
|
| 52 |
+
lock_text_freeze_layer_norm: False
|
| 53 |
+
lock_text_unlocked_layers: 0
|
| 54 |
+
log_every_n_steps: 128
|
| 55 |
+
log_level: 20
|
| 56 |
+
log_local: False
|
| 57 |
+
log_path: ./logs-lr1e-3-datacomp/flip_vit_b16_s512m_bs16k/out.log
|
| 58 |
+
logs: ./logs-lr1e-3-datacomp
|
| 59 |
+
lr: 0.001
|
| 60 |
+
lr_cooldown_end: 0.0
|
| 61 |
+
lr_cooldown_power: 1.0
|
| 62 |
+
lr_scheduler: cosine
|
| 63 |
+
max_seq_len: 100000000000000
|
| 64 |
+
model: ViT-B-16-FLIP
|
| 65 |
+
name: flip_vit_b16_s512m_bs16k
|
| 66 |
+
native_dynamic_resolution: False
|
| 67 |
+
no_set_device_rank: False
|
| 68 |
+
only_packing: True
|
| 69 |
+
precision: amp
|
| 70 |
+
pretrained:
|
| 71 |
+
pretrained_image:
|
| 72 |
+
pretrained_text:
|
| 73 |
+
rank: 0
|
| 74 |
+
remote_sync: None
|
| 75 |
+
remote_sync_frequency: 300
|
| 76 |
+
remote_sync_protocol: s3
|
| 77 |
+
report_to: wandb
|
| 78 |
+
resume: None
|
| 79 |
+
rope_attn_num_heads: 12
|
| 80 |
+
rope_model_width: 768
|
| 81 |
+
save_every_n_steps: 6104
|
| 82 |
+
save_frequency: 1
|
| 83 |
+
save_most_recent: False
|
| 84 |
+
seed: 0
|
| 85 |
+
siglip: False
|
| 86 |
+
skip_scheduler: False
|
| 87 |
+
tensorboard: False
|
| 88 |
+
tensorboard_path:
|
| 89 |
+
torchcompile: False
|
| 90 |
+
torchscript: False
|
| 91 |
+
trace: False
|
| 92 |
+
train_data: /mnt/bn/zilongdata-hl/dataset/Recap-DataComp-1B-Dataset/{000000..140146}.tar
|
| 93 |
+
train_data_upsampling_factors: None
|
| 94 |
+
train_num_samples: 128000000
|
| 95 |
+
use_bn_sync: False
|
| 96 |
+
use_bnb_linear: None
|
| 97 |
+
val_data: None
|
| 98 |
+
val_frequency: 1
|
| 99 |
+
val_num_samples: None
|
| 100 |
+
val_steps: 6104
|
| 101 |
+
wandb: True
|
| 102 |
+
wandb_notes:
|
| 103 |
+
wandb_project_name: cls-clip-NDR
|
| 104 |
+
warmup: 500
|
| 105 |
+
wd: 0.2
|
| 106 |
+
workers: 6
|
| 107 |
+
world_size: 8
|
| 108 |
+
zeroshot_frequency: 2
|
| 109 |
+
zeroshot_steps: 6104
|