diff --git "a/canary-1b-v2/AudioEncoder.mlmodelc/model.mil" "b/canary-1b-v2/AudioEncoder.mlmodelc/model.mil"
--- "a/canary-1b-v2/AudioEncoder.mlmodelc/model.mil"
+++ "b/canary-1b-v2/AudioEncoder.mlmodelc/model.mil"
@@ -57,25 +57,25 @@ program(1.0)
             tensor<int32, [4]> var_129_pad_0 = const()[name = tensor<string, []>("op_129_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_129_dilations_0 = const()[name = tensor<string, []>("op_129_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_129_groups_0 = const()[name = tensor<string, []>("op_129_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> pre_encode_out_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1047296))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(3144512))), name = tensor<string, []>("pre_encode_out_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
-            tensor<fp16, [1024]> pre_encode_out_inlier_module_bias_to_fp16 = const()[name = tensor<string, []>("pre_encode_out_inlier_module_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(3144640)))];
+            tensor<fp16, [1024, 4096, 1, 1]> pre_encode_out_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1047296))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4193088))), name = tensor<string, []>("pre_encode_out_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024]> pre_encode_out_inlier_module_bias_to_fp16 = const()[name = tensor<string, []>("pre_encode_out_inlier_module_bias_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4193280)))];
             tensor<fp16, [1, 1024, 1, 188]> var_129_cast_fp16 = conv(bias = pre_encode_out_inlier_module_bias_to_fp16, dilations = var_129_dilations_0, groups = var_129_groups_0, pad = var_129_pad_0, pad_type = var_129_pad_type_0, strides = var_129_strides_0, weight = pre_encode_out_inlier_module_weight_to_fp16_palettized, x = input_15_cast_fp16)[name = tensor<string, []>("op_129_cast_fp16")];
             tensor<string, []> var_135_pad_type_0 = const()[name = tensor<string, []>("op_135_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_135_strides_0 = const()[name = tensor<string, []>("op_135_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_135_pad_0 = const()[name = tensor<string, []>("op_135_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_135_dilations_0 = const()[name = tensor<string, []>("op_135_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_135_groups_0 = const()[name = tensor<string, []>("op_135_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> pre_encode_out_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(3304576))), name = tensor<string, []>("pre_encode_out_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [78861]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(3146752))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> pre_encode_out_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4353216))), name = tensor<string, []>("pre_encode_out_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [78861]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4195392))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_135_cast_fp16 = conv(dilations = var_135_dilations_0, groups = var_135_groups_0, pad = var_135_pad_0, pad_type = var_135_pad_type_0, strides = var_135_strides_0, weight = pre_encode_out_outlier_module_weight_to_fp16_sparsified, x = input_15_cast_fp16)[name = tensor<string, []>("op_135_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_1_cast_fp16 = add(x = var_129_cast_fp16, y = var_135_cast_fp16)[name = tensor<string, []>("inputs_1_cast_fp16")];
             tensor<int32, []> var_141 = const()[name = tensor<string, []>("op_141"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_1_axes_0 = const()[name = tensor<string, []>("out_1_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_172_to_fp16 = const()[name = tensor<string, []>("op_172_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_1_cast_fp16 = layer_norm(axes = out_1_axes_0, epsilon = var_172_to_fp16, x = inputs_1_cast_fp16)[name = tensor<string, []>("out_1_cast_fp16")];
-            tensor<fp16, [1024]> input_17_mean_0_to_fp16 = const()[name = tensor<string, []>("input_17_mean_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(3828928)))];
-            tensor<fp16, [1024]> input_17_variance_0_to_fp16 = const()[name = tensor<string, []>("input_17_variance_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(3831040)))];
-            tensor<fp16, [1024]> input_17_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_17_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(3833152)))];
-            tensor<fp16, [1024]> input_17_beta_0_to_fp16 = const()[name = tensor<string, []>("input_17_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(3835264)))];
+            tensor<fp16, [1024]> input_17_mean_0_to_fp16 = const()[name = tensor<string, []>("input_17_mean_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4877568)))];
+            tensor<fp16, [1024]> input_17_variance_0_to_fp16 = const()[name = tensor<string, []>("input_17_variance_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4879680)))];
+            tensor<fp16, [1024]> input_17_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_17_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4881792)))];
+            tensor<fp16, [1024]> input_17_beta_0_to_fp16 = const()[name = tensor<string, []>("input_17_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4883904)))];
             tensor<fp16, []> input_17_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_17_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_17_cast_fp16 = batch_norm(beta = input_17_beta_0_to_fp16, epsilon = input_17_epsilon_0_to_fp16, gamma = input_17_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_1_cast_fp16)[name = tensor<string, []>("input_17_cast_fp16")];
             tensor<string, []> var_192_pad_type_0 = const()[name = tensor<string, []>("op_192_pad_type_0"), val = tensor<string, []>("valid")];
@@ -83,15 +83,15 @@ program(1.0)
             tensor<int32, [4]> var_192_pad_0 = const()[name = tensor<string, []>("op_192_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_192_dilations_0 = const()[name = tensor<string, []>("op_192_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_192_groups_0 = const()[name = tensor<string, []>("op_192_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_0_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(3837376))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5934592))), name = tensor<string, []>("layers_0_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
-            tensor<fp16, [4096]> layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16 = const()[name = tensor<string, []>("layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16"), val = tensor<fp16, [4096]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5934720)))];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_0_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4886016))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8031808))), name = tensor<string, []>("layers_0_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096]> layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16 = const()[name = tensor<string, []>("layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16"), val = tensor<fp16, [4096]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8032000)))];
             tensor<fp16, [1, 4096, 1, 188]> var_192_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_192_dilations_0, groups = var_192_groups_0, pad = var_192_pad_0, pad_type = var_192_pad_type_0, strides = var_192_strides_0, weight = layers_0_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_17_cast_fp16)[name = tensor<string, []>("op_192_cast_fp16")];
             tensor<string, []> var_198_pad_type_0 = const()[name = tensor<string, []>("op_198_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_198_strides_0 = const()[name = tensor<string, []>("op_198_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_198_pad_0 = const()[name = tensor<string, []>("op_198_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_198_dilations_0 = const()[name = tensor<string, []>("op_198_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_198_groups_0 = const()[name = tensor<string, []>("op_198_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_0_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6103104))), name = tensor<string, []>("layers_0_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [80009]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(5942976))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_0_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8200384))), name = tensor<string, []>("layers_0_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [80009]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8040256))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_198_cast_fp16 = conv(dilations = var_198_dilations_0, groups = var_198_groups_0, pad = var_198_pad_0, pad_type = var_198_pad_type_0, strides = var_198_strides_0, weight = layers_0_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_17_cast_fp16)[name = tensor<string, []>("op_198_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_19_cast_fp16 = add(x = var_192_cast_fp16, y = var_198_cast_fp16)[name = tensor<string, []>("input_19_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_21_cast_fp16 = silu(x = input_19_cast_fp16)[name = tensor<string, []>("input_21_cast_fp16")];
@@ -100,14 +100,14 @@ program(1.0)
             tensor<int32, [4]> var_209_pad_0 = const()[name = tensor<string, []>("op_209_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_209_dilations_0 = const()[name = tensor<string, []>("op_209_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_209_groups_0 = const()[name = tensor<string, []>("op_209_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_0_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(6627456))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8724672))), name = tensor<string, []>("layers_0_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_0_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8724736))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11870528))), name = tensor<string, []>("layers_0_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_209_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_209_dilations_0, groups = var_209_groups_0, pad = var_209_pad_0, pad_type = var_209_pad_type_0, strides = var_209_strides_0, weight = layers_0_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_21_cast_fp16)[name = tensor<string, []>("op_209_cast_fp16")];
             tensor<string, []> var_215_pad_type_0 = const()[name = tensor<string, []>("op_215_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_215_strides_0 = const()[name = tensor<string, []>("op_215_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_215_pad_0 = const()[name = tensor<string, []>("op_215_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_215_dilations_0 = const()[name = tensor<string, []>("op_215_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_215_groups_0 = const()[name = tensor<string, []>("op_215_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_0_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8880128))), name = tensor<string, []>("layers_0_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [77618]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(8724800))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_0_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12026048))), name = tensor<string, []>("layers_0_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [77618]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11870720))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_215_cast_fp16 = conv(dilations = var_215_dilations_0, groups = var_215_groups_0, pad = var_215_pad_0, pad_type = var_215_pad_type_0, strides = var_215_strides_0, weight = layers_0_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_21_cast_fp16)[name = tensor<string, []>("op_215_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_3_cast_fp16 = add(x = var_209_cast_fp16, y = var_215_cast_fp16)[name = tensor<string, []>("x_3_cast_fp16")];
             tensor<fp16, []> var_217_to_fp16 = const()[name = tensor<string, []>("op_217_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -116,8 +116,8 @@ program(1.0)
             tensor<int32, [1]> out_3_axes_0 = const()[name = tensor<string, []>("out_3_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_228_to_fp16 = const()[name = tensor<string, []>("op_228_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_3_cast_fp16 = layer_norm(axes = out_3_axes_0, epsilon = var_228_to_fp16, x = inputs_3_cast_fp16)[name = tensor<string, []>("out_3_cast_fp16")];
-            tensor<fp16, [1024]> obj_1_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_1_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9404480)))];
-            tensor<fp16, [1024]> obj_1_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_1_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9406592)))];
+            tensor<fp16, [1024]> obj_1_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_1_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12550400)))];
+            tensor<fp16, [1024]> obj_1_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_1_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12552512)))];
             tensor<fp16, []> obj_1_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_1_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_1_cast_fp16 = batch_norm(beta = obj_1_beta_0_to_fp16, epsilon = obj_1_epsilon_0_to_fp16, gamma = obj_1_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_3_cast_fp16)[name = tensor<string, []>("obj_1_cast_fp16")];
             tensor<string, []> var_253_pad_type_0 = const()[name = tensor<string, []>("op_253_pad_type_0"), val = tensor<string, []>("valid")];
@@ -125,14 +125,14 @@ program(1.0)
             tensor<int32, [4]> var_253_pad_0 = const()[name = tensor<string, []>("op_253_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_253_dilations_0 = const()[name = tensor<string, []>("op_253_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_253_groups_0 = const()[name = tensor<string, []>("op_253_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9408704))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9933056))), name = tensor<string, []>("layers_0_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12554624))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13341120))), name = tensor<string, []>("layers_0_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_253_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_253_dilations_0, groups = var_253_groups_0, pad = var_253_pad_0, pad_type = var_253_pad_type_0, strides = var_253_strides_0, weight = layers_0_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = tensor<string, []>("op_253_cast_fp16")];
             tensor<string, []> var_259_pad_type_0 = const()[name = tensor<string, []>("op_259_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_259_strides_0 = const()[name = tensor<string, []>("op_259_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_259_pad_0 = const()[name = tensor<string, []>("op_259_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_259_dilations_0 = const()[name = tensor<string, []>("op_259_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_259_groups_0 = const()[name = tensor<string, []>("op_259_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9971648))), name = tensor<string, []>("layers_0_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19185]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(9933184))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13379776))), name = tensor<string, []>("layers_0_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19185]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13341312))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_259_cast_fp16 = conv(dilations = var_259_dilations_0, groups = var_259_groups_0, pad = var_259_pad_0, pad_type = var_259_pad_type_0, strides = var_259_strides_0, weight = layers_0_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_1_cast_fp16)[name = tensor<string, []>("op_259_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_1_cast_fp16 = add(x = var_253_cast_fp16, y = var_259_cast_fp16)[name = tensor<string, []>("query_1_cast_fp16")];
             tensor<string, []> var_268_pad_type_0 = const()[name = tensor<string, []>("op_268_pad_type_0"), val = tensor<string, []>("valid")];
@@ -140,14 +140,14 @@ program(1.0)
             tensor<int32, [4]> var_268_pad_0 = const()[name = tensor<string, []>("op_268_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_268_dilations_0 = const()[name = tensor<string, []>("op_268_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_268_groups_0 = const()[name = tensor<string, []>("op_268_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10102784))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10627136))), name = tensor<string, []>("layers_0_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13510912))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14297408))), name = tensor<string, []>("layers_0_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_268_cast_fp16 = conv(dilations = var_268_dilations_0, groups = var_268_groups_0, pad = var_268_pad_0, pad_type = var_268_pad_type_0, strides = var_268_strides_0, weight = layers_0_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = tensor<string, []>("op_268_cast_fp16")];
             tensor<string, []> var_274_pad_type_0 = const()[name = tensor<string, []>("op_274_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_274_strides_0 = const()[name = tensor<string, []>("op_274_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_274_pad_0 = const()[name = tensor<string, []>("op_274_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_274_dilations_0 = const()[name = tensor<string, []>("op_274_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_274_groups_0 = const()[name = tensor<string, []>("op_274_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10673792))), name = tensor<string, []>("layers_0_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [23226]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10627264))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14344128))), name = tensor<string, []>("layers_0_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [23226]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14297600))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_274_cast_fp16 = conv(dilations = var_274_dilations_0, groups = var_274_groups_0, pad = var_274_pad_0, pad_type = var_274_pad_type_0, strides = var_274_strides_0, weight = layers_0_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_1_cast_fp16)[name = tensor<string, []>("op_274_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_1_cast_fp16 = add(x = var_268_cast_fp16, y = var_274_cast_fp16)[name = tensor<string, []>("key_1_cast_fp16")];
             tensor<string, []> var_284_pad_type_0 = const()[name = tensor<string, []>("op_284_pad_type_0"), val = tensor<string, []>("valid")];
@@ -155,33 +155,33 @@ program(1.0)
             tensor<int32, [4]> var_284_pad_0 = const()[name = tensor<string, []>("op_284_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_284_dilations_0 = const()[name = tensor<string, []>("op_284_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_284_groups_0 = const()[name = tensor<string, []>("op_284_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(10804928))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11329280))), name = tensor<string, []>("layers_0_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14475264))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15261760))), name = tensor<string, []>("layers_0_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_284_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_284_dilations_0, groups = var_284_groups_0, pad = var_284_pad_0, pad_type = var_284_pad_type_0, strides = var_284_strides_0, weight = layers_0_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_1_cast_fp16)[name = tensor<string, []>("op_284_cast_fp16")];
             tensor<string, []> var_290_pad_type_0 = const()[name = tensor<string, []>("op_290_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_290_strides_0 = const()[name = tensor<string, []>("op_290_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_290_pad_0 = const()[name = tensor<string, []>("op_290_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_290_dilations_0 = const()[name = tensor<string, []>("op_290_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_290_groups_0 = const()[name = tensor<string, []>("op_290_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11371904))), name = tensor<string, []>("layers_0_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [21200]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11329408))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15304448))), name = tensor<string, []>("layers_0_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [21200]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15261952))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_290_cast_fp16 = conv(dilations = var_290_dilations_0, groups = var_290_groups_0, pad = var_290_pad_0, pad_type = var_290_pad_type_0, strides = var_290_strides_0, weight = layers_0_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_1_cast_fp16)[name = tensor<string, []>("op_290_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_1_cast_fp16 = add(x = var_284_cast_fp16, y = var_290_cast_fp16)[name = tensor<string, []>("value_1_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_293_to_fp16 = const()[name = tensor<string, []>("op_293_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11503040)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_293_to_fp16 = const()[name = tensor<string, []>("op_293_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15435584)))];
             tensor<fp16, [1, 1024, 1, 188]> query_3_cast_fp16 = add(x = query_1_cast_fp16, y = var_293_to_fp16)[name = tensor<string, []>("query_3_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_296_to_fp16 = const()[name = tensor<string, []>("op_296_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11505152)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_296_to_fp16 = const()[name = tensor<string, []>("op_296_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15437696)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_1_cast_fp16 = add(x = query_1_cast_fp16, y = var_296_to_fp16)[name = tensor<string, []>("q_with_bias_v_1_cast_fp16")];
             tensor<string, []> var_306_pad_type_0 = const()[name = tensor<string, []>("op_306_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_306_strides_0 = const()[name = tensor<string, []>("op_306_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_306_pad_0 = const()[name = tensor<string, []>("op_306_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_306_dilations_0 = const()[name = tensor<string, []>("op_306_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_306_groups_0 = const()[name = tensor<string, []>("op_306_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(11507264))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12031616))), name = tensor<string, []>("layers_0_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15439808))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16226304))), name = tensor<string, []>("layers_0_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_306_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_306_dilations_0, groups = var_306_groups_0, pad = var_306_pad_0, pad_type = var_306_pad_type_0, strides = var_306_strides_0, weight = layers_0_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_306_cast_fp16")];
             tensor<string, []> var_312_pad_type_0 = const()[name = tensor<string, []>("op_312_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_312_strides_0 = const()[name = tensor<string, []>("op_312_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_312_pad_0 = const()[name = tensor<string, []>("op_312_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_312_dilations_0 = const()[name = tensor<string, []>("op_312_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_312_groups_0 = const()[name = tensor<string, []>("op_312_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12113664))), name = tensor<string, []>("layers_0_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [40913]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12031744))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16308416))), name = tensor<string, []>("layers_0_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [40913]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16226496))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_312_cast_fp16 = conv(dilations = var_312_dilations_0, groups = var_312_groups_0, pad = var_312_pad_0, pad_type = var_312_pad_type_0, strides = var_312_strides_0, weight = layers_0_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_312_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_1_cast_fp16 = add(x = var_306_cast_fp16, y = var_312_cast_fp16)[name = tensor<string, []>("p_1_cast_fp16")];
             tensor<int32, [4]> var_316 = const()[name = tensor<string, []>("op_316"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -232,22 +232,22 @@ program(1.0)
             tensor<int32, [4]> var_369_pad_0 = const()[name = tensor<string, []>("op_369_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_369_dilations_0 = const()[name = tensor<string, []>("op_369_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_369_groups_0 = const()[name = tensor<string, []>("op_369_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12244800))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12769152))), name = tensor<string, []>("layers_0_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(16439552))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17226048))), name = tensor<string, []>("layers_0_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_369_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_369_dilations_0, groups = var_369_groups_0, pad = var_369_pad_0, pad_type = var_369_pad_type_0, strides = var_369_strides_0, weight = layers_0_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_23_cast_fp16)[name = tensor<string, []>("op_369_cast_fp16")];
             tensor<string, []> var_375_pad_type_0 = const()[name = tensor<string, []>("op_375_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_375_strides_0 = const()[name = tensor<string, []>("op_375_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_375_pad_0 = const()[name = tensor<string, []>("op_375_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_375_dilations_0 = const()[name = tensor<string, []>("op_375_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_375_groups_0 = const()[name = tensor<string, []>("op_375_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12808448))), name = tensor<string, []>("layers_0_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19534]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12769280))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17265408))), name = tensor<string, []>("layers_0_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19534]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17226240))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_375_cast_fp16 = conv(dilations = var_375_dilations_0, groups = var_375_groups_0, pad = var_375_pad_0, pad_type = var_375_pad_type_0, strides = var_375_strides_0, weight = layers_0_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_23_cast_fp16)[name = tensor<string, []>("op_375_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_5_cast_fp16 = add(x = var_369_cast_fp16, y = var_375_cast_fp16)[name = tensor<string, []>("obj_5_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_5_cast_fp16 = add(x = inputs_3_cast_fp16, y = obj_5_cast_fp16)[name = tensor<string, []>("inputs_5_cast_fp16")];
             tensor<int32, [1]> out_5_axes_0 = const()[name = tensor<string, []>("out_5_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_386_to_fp16 = const()[name = tensor<string, []>("op_386_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_5_cast_fp16 = layer_norm(axes = out_5_axes_0, epsilon = var_386_to_fp16, x = inputs_5_cast_fp16)[name = tensor<string, []>("out_5_cast_fp16")];
-            tensor<fp16, [1024]> input_25_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_25_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12939584)))];
-            tensor<fp16, [1024]> input_25_beta_0_to_fp16 = const()[name = tensor<string, []>("input_25_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12941696)))];
+            tensor<fp16, [1024]> input_25_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_25_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17396544)))];
+            tensor<fp16, [1024]> input_25_beta_0_to_fp16 = const()[name = tensor<string, []>("input_25_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17398656)))];
             tensor<fp16, []> input_25_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_25_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_25_cast_fp16 = batch_norm(beta = input_25_beta_0_to_fp16, epsilon = input_25_epsilon_0_to_fp16, gamma = input_25_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_5_cast_fp16)[name = tensor<string, []>("input_25_cast_fp16")];
             tensor<string, []> var_407_pad_type_0 = const()[name = tensor<string, []>("op_407_pad_type_0"), val = tensor<string, []>("valid")];
@@ -255,14 +255,14 @@ program(1.0)
             tensor<int32, [4]> var_407_pad_0 = const()[name = tensor<string, []>("op_407_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_407_dilations_0 = const()[name = tensor<string, []>("op_407_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_407_groups_0 = const()[name = tensor<string, []>("op_407_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_0_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(12943808))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13992448))), name = tensor<string, []>("layers_0_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_0_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17400768))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18973696))), name = tensor<string, []>("layers_0_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_407_cast_fp16 = conv(dilations = var_407_dilations_0, groups = var_407_groups_0, pad = var_407_pad_0, pad_type = var_407_pad_type_0, strides = var_407_strides_0, weight = layers_0_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_25_cast_fp16)[name = tensor<string, []>("op_407_cast_fp16")];
             tensor<string, []> var_413_pad_type_0 = const()[name = tensor<string, []>("op_413_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_413_strides_0 = const()[name = tensor<string, []>("op_413_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_413_pad_0 = const()[name = tensor<string, []>("op_413_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_413_dilations_0 = const()[name = tensor<string, []>("op_413_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_413_groups_0 = const()[name = tensor<string, []>("op_413_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_0_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14062592))), name = tensor<string, []>("layers_0_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [34959]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(13992576))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_0_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19043904))), name = tensor<string, []>("layers_0_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [34959]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(18973888))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_413_cast_fp16 = conv(dilations = var_413_dilations_0, groups = var_413_groups_0, pad = var_413_pad_0, pad_type = var_413_pad_type_0, strides = var_413_strides_0, weight = layers_0_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_25_cast_fp16)[name = tensor<string, []>("op_413_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_27_cast_fp16 = add(x = var_407_cast_fp16, y = var_413_cast_fp16)[name = tensor<string, []>("input_27_cast_fp16")];
             tensor<int32, []> input_29_split_num_splits_0 = const()[name = tensor<string, []>("input_29_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -275,8 +275,8 @@ program(1.0)
             tensor<int32, []> input_31_groups_0 = const()[name = tensor<string, []>("input_31_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_31_strides_0 = const()[name = tensor<string, []>("input_31_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_31_dilations_0 = const()[name = tensor<string, []>("input_31_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_268_to_fp16 = const()[name = tensor<string, []>("const_268_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14324800)))];
-            tensor<fp16, [1024]> const_269_to_fp16 = const()[name = tensor<string, []>("const_269_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14343296)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_268_to_fp16 = const()[name = tensor<string, []>("const_268_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19306112)))];
+            tensor<fp16, [1024]> const_269_to_fp16 = const()[name = tensor<string, []>("const_269_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19324608)))];
             tensor<fp16, [1, 1024, 1, 188]> input_33_cast_fp16 = conv(bias = const_269_to_fp16, dilations = input_31_dilations_0, groups = input_31_groups_0, pad = input_31_pad_0, pad_type = input_31_pad_type_0, strides = input_31_strides_0, weight = const_268_to_fp16, x = input_29_cast_fp16)[name = tensor<string, []>("input_33_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_35_cast_fp16 = silu(x = input_33_cast_fp16)[name = tensor<string, []>("input_35_cast_fp16")];
             tensor<string, []> var_435_pad_type_0 = const()[name = tensor<string, []>("op_435_pad_type_0"), val = tensor<string, []>("valid")];
@@ -284,22 +284,22 @@ program(1.0)
             tensor<int32, [4]> var_435_pad_0 = const()[name = tensor<string, []>("op_435_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_435_dilations_0 = const()[name = tensor<string, []>("op_435_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_435_groups_0 = const()[name = tensor<string, []>("op_435_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14345408))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14869760))), name = tensor<string, []>("layers_0_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19326720))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20113216))), name = tensor<string, []>("layers_0_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_435_cast_fp16 = conv(dilations = var_435_dilations_0, groups = var_435_groups_0, pad = var_435_pad_0, pad_type = var_435_pad_type_0, strides = var_435_strides_0, weight = layers_0_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_35_cast_fp16)[name = tensor<string, []>("op_435_cast_fp16")];
             tensor<string, []> var_441_pad_type_0 = const()[name = tensor<string, []>("op_441_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_441_strides_0 = const()[name = tensor<string, []>("op_441_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_441_pad_0 = const()[name = tensor<string, []>("op_441_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_441_dilations_0 = const()[name = tensor<string, []>("op_441_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_441_groups_0 = const()[name = tensor<string, []>("op_441_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_0_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14904640))), name = tensor<string, []>("layers_0_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17334]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(14869888))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_0_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20148160))), name = tensor<string, []>("layers_0_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17334]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20113408))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_441_cast_fp16 = conv(dilations = var_441_dilations_0, groups = var_441_groups_0, pad = var_441_pad_0, pad_type = var_441_pad_type_0, strides = var_441_strides_0, weight = layers_0_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_35_cast_fp16)[name = tensor<string, []>("op_441_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_5_cast_fp16 = add(x = var_435_cast_fp16, y = var_441_cast_fp16)[name = tensor<string, []>("x_5_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_7_cast_fp16 = add(x = inputs_5_cast_fp16, y = x_5_cast_fp16)[name = tensor<string, []>("inputs_7_cast_fp16")];
             tensor<int32, [1]> out_7_axes_0 = const()[name = tensor<string, []>("out_7_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_452_to_fp16 = const()[name = tensor<string, []>("op_452_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_7_cast_fp16 = layer_norm(axes = out_7_axes_0, epsilon = var_452_to_fp16, x = inputs_7_cast_fp16)[name = tensor<string, []>("out_7_cast_fp16")];
-            tensor<fp16, [1024]> input_37_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_37_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15035776)))];
-            tensor<fp16, [1024]> input_37_beta_0_to_fp16 = const()[name = tensor<string, []>("input_37_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15037888)))];
+            tensor<fp16, [1024]> input_37_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_37_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20279296)))];
+            tensor<fp16, [1024]> input_37_beta_0_to_fp16 = const()[name = tensor<string, []>("input_37_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20281408)))];
             tensor<fp16, []> input_37_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_37_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_37_cast_fp16 = batch_norm(beta = input_37_beta_0_to_fp16, epsilon = input_37_epsilon_0_to_fp16, gamma = input_37_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_7_cast_fp16)[name = tensor<string, []>("input_37_cast_fp16")];
             tensor<string, []> var_472_pad_type_0 = const()[name = tensor<string, []>("op_472_pad_type_0"), val = tensor<string, []>("valid")];
@@ -307,14 +307,14 @@ program(1.0)
             tensor<int32, [4]> var_472_pad_0 = const()[name = tensor<string, []>("op_472_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_472_dilations_0 = const()[name = tensor<string, []>("op_472_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_472_groups_0 = const()[name = tensor<string, []>("op_472_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_0_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(15040000))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17137216))), name = tensor<string, []>("layers_0_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_0_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20283520))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23429312))), name = tensor<string, []>("layers_0_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_472_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_472_dilations_0, groups = var_472_groups_0, pad = var_472_pad_0, pad_type = var_472_pad_type_0, strides = var_472_strides_0, weight = layers_0_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_37_cast_fp16)[name = tensor<string, []>("op_472_cast_fp16")];
             tensor<string, []> var_478_pad_type_0 = const()[name = tensor<string, []>("op_478_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_478_strides_0 = const()[name = tensor<string, []>("op_478_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_478_pad_0 = const()[name = tensor<string, []>("op_478_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_478_dilations_0 = const()[name = tensor<string, []>("op_478_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_478_groups_0 = const()[name = tensor<string, []>("op_478_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_0_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17282496))), name = tensor<string, []>("layers_0_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72532]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17137344))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_0_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23574656))), name = tensor<string, []>("layers_0_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72532]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23429504))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_478_cast_fp16 = conv(dilations = var_478_dilations_0, groups = var_478_groups_0, pad = var_478_pad_0, pad_type = var_478_pad_type_0, strides = var_478_strides_0, weight = layers_0_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_37_cast_fp16)[name = tensor<string, []>("op_478_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_39_cast_fp16 = add(x = var_472_cast_fp16, y = var_478_cast_fp16)[name = tensor<string, []>("input_39_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_41_cast_fp16 = silu(x = input_39_cast_fp16)[name = tensor<string, []>("input_41_cast_fp16")];
@@ -323,14 +323,14 @@ program(1.0)
             tensor<int32, [4]> var_489_pad_0 = const()[name = tensor<string, []>("op_489_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_489_dilations_0 = const()[name = tensor<string, []>("op_489_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_489_groups_0 = const()[name = tensor<string, []>("op_489_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_0_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(17806848))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19904064))), name = tensor<string, []>("layers_0_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_0_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(24099008))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27244800))), name = tensor<string, []>("layers_0_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_489_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_489_dilations_0, groups = var_489_groups_0, pad = var_489_pad_0, pad_type = var_489_pad_type_0, strides = var_489_strides_0, weight = layers_0_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_41_cast_fp16)[name = tensor<string, []>("op_489_cast_fp16")];
             tensor<string, []> var_495_pad_type_0 = const()[name = tensor<string, []>("op_495_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_495_strides_0 = const()[name = tensor<string, []>("op_495_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_495_pad_0 = const()[name = tensor<string, []>("op_495_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_495_dilations_0 = const()[name = tensor<string, []>("op_495_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_495_groups_0 = const()[name = tensor<string, []>("op_495_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_0_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20044096))), name = tensor<string, []>("layers_0_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69907]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(19904192))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_0_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27384896))), name = tensor<string, []>("layers_0_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69907]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27244992))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_495_cast_fp16 = conv(dilations = var_495_dilations_0, groups = var_495_groups_0, pad = var_495_pad_0, pad_type = var_495_pad_type_0, strides = var_495_strides_0, weight = layers_0_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_41_cast_fp16)[name = tensor<string, []>("op_495_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_7_cast_fp16 = add(x = var_489_cast_fp16, y = var_495_cast_fp16)[name = tensor<string, []>("x_7_cast_fp16")];
             tensor<fp16, []> var_497_to_fp16 = const()[name = tensor<string, []>("op_497_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -339,16 +339,16 @@ program(1.0)
             tensor<int32, [1]> out_9_axes_0 = const()[name = tensor<string, []>("out_9_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_508_to_fp16 = const()[name = tensor<string, []>("op_508_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_9_cast_fp16 = layer_norm(axes = out_9_axes_0, epsilon = var_508_to_fp16, x = inputs_9_cast_fp16)[name = tensor<string, []>("out_9_cast_fp16")];
-            tensor<fp16, [1024]> inputs_11_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_11_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20568448)))];
-            tensor<fp16, [1024]> inputs_11_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_11_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20570560)))];
+            tensor<fp16, [1024]> inputs_11_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_11_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27909248)))];
+            tensor<fp16, [1024]> inputs_11_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_11_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27911360)))];
             tensor<fp16, []> inputs_11_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_11_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_11_cast_fp16 = batch_norm(beta = inputs_11_beta_0_to_fp16, epsilon = inputs_11_epsilon_0_to_fp16, gamma = inputs_11_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_9_cast_fp16)[name = tensor<string, []>("inputs_11_cast_fp16")];
             tensor<int32, []> var_522 = const()[name = tensor<string, []>("op_522"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_11_axes_0 = const()[name = tensor<string, []>("out_11_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_553_to_fp16 = const()[name = tensor<string, []>("op_553_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_11_cast_fp16 = layer_norm(axes = out_11_axes_0, epsilon = var_553_to_fp16, x = inputs_11_cast_fp16)[name = tensor<string, []>("out_11_cast_fp16")];
-            tensor<fp16, [1024]> input_43_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_43_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20572672)))];
-            tensor<fp16, [1024]> input_43_beta_0_to_fp16 = const()[name = tensor<string, []>("input_43_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20574784)))];
+            tensor<fp16, [1024]> input_43_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_43_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27913472)))];
+            tensor<fp16, [1024]> input_43_beta_0_to_fp16 = const()[name = tensor<string, []>("input_43_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27915584)))];
             tensor<fp16, []> input_43_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_43_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_43_cast_fp16 = batch_norm(beta = input_43_beta_0_to_fp16, epsilon = input_43_epsilon_0_to_fp16, gamma = input_43_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_11_cast_fp16)[name = tensor<string, []>("input_43_cast_fp16")];
             tensor<string, []> var_573_pad_type_0 = const()[name = tensor<string, []>("op_573_pad_type_0"), val = tensor<string, []>("valid")];
@@ -356,14 +356,14 @@ program(1.0)
             tensor<int32, [4]> var_573_pad_0 = const()[name = tensor<string, []>("op_573_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_573_dilations_0 = const()[name = tensor<string, []>("op_573_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_573_groups_0 = const()[name = tensor<string, []>("op_573_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_1_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(20576896))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22674112))), name = tensor<string, []>("layers_1_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_1_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27917696))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31063488))), name = tensor<string, []>("layers_1_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_573_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_573_dilations_0, groups = var_573_groups_0, pad = var_573_pad_0, pad_type = var_573_pad_type_0, strides = var_573_strides_0, weight = layers_1_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_43_cast_fp16)[name = tensor<string, []>("op_573_cast_fp16")];
             tensor<string, []> var_579_pad_type_0 = const()[name = tensor<string, []>("op_579_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_579_strides_0 = const()[name = tensor<string, []>("op_579_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_579_pad_0 = const()[name = tensor<string, []>("op_579_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_579_dilations_0 = const()[name = tensor<string, []>("op_579_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_579_groups_0 = const()[name = tensor<string, []>("op_579_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_1_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22824576))), name = tensor<string, []>("layers_1_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75124]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(22674240))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_1_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31214016))), name = tensor<string, []>("layers_1_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75124]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31063680))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_579_cast_fp16 = conv(dilations = var_579_dilations_0, groups = var_579_groups_0, pad = var_579_pad_0, pad_type = var_579_pad_type_0, strides = var_579_strides_0, weight = layers_1_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_43_cast_fp16)[name = tensor<string, []>("op_579_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_45_cast_fp16 = add(x = var_573_cast_fp16, y = var_579_cast_fp16)[name = tensor<string, []>("input_45_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_47_cast_fp16 = silu(x = input_45_cast_fp16)[name = tensor<string, []>("input_47_cast_fp16")];
@@ -372,14 +372,14 @@ program(1.0)
             tensor<int32, [4]> var_590_pad_0 = const()[name = tensor<string, []>("op_590_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_590_dilations_0 = const()[name = tensor<string, []>("op_590_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_590_groups_0 = const()[name = tensor<string, []>("op_590_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_1_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(23348928))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25446144))), name = tensor<string, []>("layers_1_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_1_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31738368))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(34884160))), name = tensor<string, []>("layers_1_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_590_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_590_dilations_0, groups = var_590_groups_0, pad = var_590_pad_0, pad_type = var_590_pad_type_0, strides = var_590_strides_0, weight = layers_1_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_47_cast_fp16)[name = tensor<string, []>("op_590_cast_fp16")];
             tensor<string, []> var_596_pad_type_0 = const()[name = tensor<string, []>("op_596_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_596_strides_0 = const()[name = tensor<string, []>("op_596_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_596_pad_0 = const()[name = tensor<string, []>("op_596_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_596_dilations_0 = const()[name = tensor<string, []>("op_596_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_596_groups_0 = const()[name = tensor<string, []>("op_596_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_1_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25630464))), name = tensor<string, []>("layers_1_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [92059]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(25446272))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_1_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35068544))), name = tensor<string, []>("layers_1_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [92059]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(34884352))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_596_cast_fp16 = conv(dilations = var_596_dilations_0, groups = var_596_groups_0, pad = var_596_pad_0, pad_type = var_596_pad_type_0, strides = var_596_strides_0, weight = layers_1_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_47_cast_fp16)[name = tensor<string, []>("op_596_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_9_cast_fp16 = add(x = var_590_cast_fp16, y = var_596_cast_fp16)[name = tensor<string, []>("x_9_cast_fp16")];
             tensor<fp16, []> var_598_to_fp16 = const()[name = tensor<string, []>("op_598_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -388,8 +388,8 @@ program(1.0)
             tensor<int32, [1]> out_13_axes_0 = const()[name = tensor<string, []>("out_13_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_609_to_fp16 = const()[name = tensor<string, []>("op_609_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_13_cast_fp16 = layer_norm(axes = out_13_axes_0, epsilon = var_609_to_fp16, x = inputs_13_cast_fp16)[name = tensor<string, []>("out_13_cast_fp16")];
-            tensor<fp16, [1024]> obj_7_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_7_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26154816)))];
-            tensor<fp16, [1024]> obj_7_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_7_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26156928)))];
+            tensor<fp16, [1024]> obj_7_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_7_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35592896)))];
+            tensor<fp16, [1024]> obj_7_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_7_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35595008)))];
             tensor<fp16, []> obj_7_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_7_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_7_cast_fp16 = batch_norm(beta = obj_7_beta_0_to_fp16, epsilon = obj_7_epsilon_0_to_fp16, gamma = obj_7_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_13_cast_fp16)[name = tensor<string, []>("obj_7_cast_fp16")];
             tensor<string, []> var_634_pad_type_0 = const()[name = tensor<string, []>("op_634_pad_type_0"), val = tensor<string, []>("valid")];
@@ -397,14 +397,14 @@ program(1.0)
             tensor<int32, [4]> var_634_pad_0 = const()[name = tensor<string, []>("op_634_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_634_dilations_0 = const()[name = tensor<string, []>("op_634_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_634_groups_0 = const()[name = tensor<string, []>("op_634_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26159040))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26683392))), name = tensor<string, []>("layers_1_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(35597120))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(36383616))), name = tensor<string, []>("layers_1_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_634_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_634_dilations_0, groups = var_634_groups_0, pad = var_634_pad_0, pad_type = var_634_pad_type_0, strides = var_634_strides_0, weight = layers_1_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_7_cast_fp16)[name = tensor<string, []>("op_634_cast_fp16")];
             tensor<string, []> var_640_pad_type_0 = const()[name = tensor<string, []>("op_640_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_640_strides_0 = const()[name = tensor<string, []>("op_640_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_640_pad_0 = const()[name = tensor<string, []>("op_640_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_640_dilations_0 = const()[name = tensor<string, []>("op_640_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_640_groups_0 = const()[name = tensor<string, []>("op_640_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26720704))), name = tensor<string, []>("layers_1_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18543]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26683520))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(36420992))), name = tensor<string, []>("layers_1_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18543]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(36383808))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_640_cast_fp16 = conv(dilations = var_640_dilations_0, groups = var_640_groups_0, pad = var_640_pad_0, pad_type = var_640_pad_type_0, strides = var_640_strides_0, weight = layers_1_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_7_cast_fp16)[name = tensor<string, []>("op_640_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_5_cast_fp16 = add(x = var_634_cast_fp16, y = var_640_cast_fp16)[name = tensor<string, []>("query_5_cast_fp16")];
             tensor<string, []> var_649_pad_type_0 = const()[name = tensor<string, []>("op_649_pad_type_0"), val = tensor<string, []>("valid")];
@@ -412,14 +412,14 @@ program(1.0)
             tensor<int32, [4]> var_649_pad_0 = const()[name = tensor<string, []>("op_649_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_649_dilations_0 = const()[name = tensor<string, []>("op_649_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_649_groups_0 = const()[name = tensor<string, []>("op_649_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(26851840))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27376192))), name = tensor<string, []>("layers_1_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(36552128))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37338624))), name = tensor<string, []>("layers_1_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_649_cast_fp16 = conv(dilations = var_649_dilations_0, groups = var_649_groups_0, pad = var_649_pad_0, pad_type = var_649_pad_type_0, strides = var_649_strides_0, weight = layers_1_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_7_cast_fp16)[name = tensor<string, []>("op_649_cast_fp16")];
             tensor<string, []> var_655_pad_type_0 = const()[name = tensor<string, []>("op_655_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_655_strides_0 = const()[name = tensor<string, []>("op_655_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_655_pad_0 = const()[name = tensor<string, []>("op_655_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_655_dilations_0 = const()[name = tensor<string, []>("op_655_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_655_groups_0 = const()[name = tensor<string, []>("op_655_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27423296))), name = tensor<string, []>("layers_1_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [23456]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27376320))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37385792))), name = tensor<string, []>("layers_1_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [23456]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37338816))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_655_cast_fp16 = conv(dilations = var_655_dilations_0, groups = var_655_groups_0, pad = var_655_pad_0, pad_type = var_655_pad_type_0, strides = var_655_strides_0, weight = layers_1_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_7_cast_fp16)[name = tensor<string, []>("op_655_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_3_cast_fp16 = add(x = var_649_cast_fp16, y = var_655_cast_fp16)[name = tensor<string, []>("key_3_cast_fp16")];
             tensor<string, []> var_665_pad_type_0 = const()[name = tensor<string, []>("op_665_pad_type_0"), val = tensor<string, []>("valid")];
@@ -427,33 +427,33 @@ program(1.0)
             tensor<int32, [4]> var_665_pad_0 = const()[name = tensor<string, []>("op_665_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_665_dilations_0 = const()[name = tensor<string, []>("op_665_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_665_groups_0 = const()[name = tensor<string, []>("op_665_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(27554432))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28078784))), name = tensor<string, []>("layers_1_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37516928))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38303424))), name = tensor<string, []>("layers_1_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_665_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_665_dilations_0, groups = var_665_groups_0, pad = var_665_pad_0, pad_type = var_665_pad_type_0, strides = var_665_strides_0, weight = layers_1_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_7_cast_fp16)[name = tensor<string, []>("op_665_cast_fp16")];
             tensor<string, []> var_671_pad_type_0 = const()[name = tensor<string, []>("op_671_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_671_strides_0 = const()[name = tensor<string, []>("op_671_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_671_pad_0 = const()[name = tensor<string, []>("op_671_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_671_dilations_0 = const()[name = tensor<string, []>("op_671_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_671_groups_0 = const()[name = tensor<string, []>("op_671_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28114240))), name = tensor<string, []>("layers_1_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17605]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28078912))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38338944))), name = tensor<string, []>("layers_1_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17605]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38303616))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_671_cast_fp16 = conv(dilations = var_671_dilations_0, groups = var_671_groups_0, pad = var_671_pad_0, pad_type = var_671_pad_type_0, strides = var_671_strides_0, weight = layers_1_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_7_cast_fp16)[name = tensor<string, []>("op_671_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_3_cast_fp16 = add(x = var_665_cast_fp16, y = var_671_cast_fp16)[name = tensor<string, []>("value_3_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_674_to_fp16 = const()[name = tensor<string, []>("op_674_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28245376)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_674_to_fp16 = const()[name = tensor<string, []>("op_674_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38470080)))];
             tensor<fp16, [1, 1024, 1, 188]> query_7_cast_fp16 = add(x = query_5_cast_fp16, y = var_674_to_fp16)[name = tensor<string, []>("query_7_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_677_to_fp16 = const()[name = tensor<string, []>("op_677_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28247488)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_677_to_fp16 = const()[name = tensor<string, []>("op_677_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38472192)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_3_cast_fp16 = add(x = query_5_cast_fp16, y = var_677_to_fp16)[name = tensor<string, []>("q_with_bias_v_3_cast_fp16")];
             tensor<string, []> var_687_pad_type_0 = const()[name = tensor<string, []>("op_687_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_687_strides_0 = const()[name = tensor<string, []>("op_687_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_687_pad_0 = const()[name = tensor<string, []>("op_687_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_687_dilations_0 = const()[name = tensor<string, []>("op_687_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_687_groups_0 = const()[name = tensor<string, []>("op_687_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28249600))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28773952))), name = tensor<string, []>("layers_1_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(38474304))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39260800))), name = tensor<string, []>("layers_1_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_687_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_687_dilations_0, groups = var_687_groups_0, pad = var_687_pad_0, pad_type = var_687_pad_type_0, strides = var_687_strides_0, weight = layers_1_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_687_cast_fp16")];
             tensor<string, []> var_693_pad_type_0 = const()[name = tensor<string, []>("op_693_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_693_strides_0 = const()[name = tensor<string, []>("op_693_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_693_pad_0 = const()[name = tensor<string, []>("op_693_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_693_dilations_0 = const()[name = tensor<string, []>("op_693_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_693_groups_0 = const()[name = tensor<string, []>("op_693_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28853120))), name = tensor<string, []>("layers_1_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [39474]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28774080))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39340032))), name = tensor<string, []>("layers_1_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [39474]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39260992))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_693_cast_fp16 = conv(dilations = var_693_dilations_0, groups = var_693_groups_0, pad = var_693_pad_0, pad_type = var_693_pad_type_0, strides = var_693_strides_0, weight = layers_1_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_693_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_3_cast_fp16 = add(x = var_687_cast_fp16, y = var_693_cast_fp16)[name = tensor<string, []>("p_3_cast_fp16")];
             tensor<int32, [4]> var_697 = const()[name = tensor<string, []>("op_697"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -504,22 +504,22 @@ program(1.0)
             tensor<int32, [4]> var_750_pad_0 = const()[name = tensor<string, []>("op_750_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_750_dilations_0 = const()[name = tensor<string, []>("op_750_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_750_groups_0 = const()[name = tensor<string, []>("op_750_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(28984256))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29508608))), name = tensor<string, []>("layers_1_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39471168))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40257664))), name = tensor<string, []>("layers_1_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_750_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_750_dilations_0, groups = var_750_groups_0, pad = var_750_pad_0, pad_type = var_750_pad_type_0, strides = var_750_strides_0, weight = layers_1_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_49_cast_fp16)[name = tensor<string, []>("op_750_cast_fp16")];
             tensor<string, []> var_756_pad_type_0 = const()[name = tensor<string, []>("op_756_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_756_strides_0 = const()[name = tensor<string, []>("op_756_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_756_pad_0 = const()[name = tensor<string, []>("op_756_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_756_dilations_0 = const()[name = tensor<string, []>("op_756_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_756_groups_0 = const()[name = tensor<string, []>("op_756_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29543104))), name = tensor<string, []>("layers_1_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17123]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29508736))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40292224))), name = tensor<string, []>("layers_1_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17123]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40257856))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_756_cast_fp16 = conv(dilations = var_756_dilations_0, groups = var_756_groups_0, pad = var_756_pad_0, pad_type = var_756_pad_type_0, strides = var_756_strides_0, weight = layers_1_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_49_cast_fp16)[name = tensor<string, []>("op_756_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_9_cast_fp16 = add(x = var_750_cast_fp16, y = var_756_cast_fp16)[name = tensor<string, []>("obj_9_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_15_cast_fp16 = add(x = inputs_13_cast_fp16, y = obj_9_cast_fp16)[name = tensor<string, []>("inputs_15_cast_fp16")];
             tensor<int32, [1]> out_15_axes_0 = const()[name = tensor<string, []>("out_15_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_767_to_fp16 = const()[name = tensor<string, []>("op_767_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_15_cast_fp16 = layer_norm(axes = out_15_axes_0, epsilon = var_767_to_fp16, x = inputs_15_cast_fp16)[name = tensor<string, []>("out_15_cast_fp16")];
-            tensor<fp16, [1024]> input_51_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_51_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29674240)))];
-            tensor<fp16, [1024]> input_51_beta_0_to_fp16 = const()[name = tensor<string, []>("input_51_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29676352)))];
+            tensor<fp16, [1024]> input_51_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_51_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40423360)))];
+            tensor<fp16, [1024]> input_51_beta_0_to_fp16 = const()[name = tensor<string, []>("input_51_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40425472)))];
             tensor<fp16, []> input_51_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_51_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_51_cast_fp16 = batch_norm(beta = input_51_beta_0_to_fp16, epsilon = input_51_epsilon_0_to_fp16, gamma = input_51_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_15_cast_fp16)[name = tensor<string, []>("input_51_cast_fp16")];
             tensor<string, []> var_788_pad_type_0 = const()[name = tensor<string, []>("op_788_pad_type_0"), val = tensor<string, []>("valid")];
@@ -527,14 +527,14 @@ program(1.0)
             tensor<int32, [4]> var_788_pad_0 = const()[name = tensor<string, []>("op_788_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_788_dilations_0 = const()[name = tensor<string, []>("op_788_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_788_groups_0 = const()[name = tensor<string, []>("op_788_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_1_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(29678464))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30727104))), name = tensor<string, []>("layers_1_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_1_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40427584))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42000512))), name = tensor<string, []>("layers_1_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_788_cast_fp16 = conv(dilations = var_788_dilations_0, groups = var_788_groups_0, pad = var_788_pad_0, pad_type = var_788_pad_type_0, strides = var_788_strides_0, weight = layers_1_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_51_cast_fp16)[name = tensor<string, []>("op_788_cast_fp16")];
             tensor<string, []> var_794_pad_type_0 = const()[name = tensor<string, []>("op_794_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_794_strides_0 = const()[name = tensor<string, []>("op_794_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_794_pad_0 = const()[name = tensor<string, []>("op_794_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_794_dilations_0 = const()[name = tensor<string, []>("op_794_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_794_groups_0 = const()[name = tensor<string, []>("op_794_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_1_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30795200))), name = tensor<string, []>("layers_1_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33947]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(30727232))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_1_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42068672))), name = tensor<string, []>("layers_1_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33947]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42000704))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_794_cast_fp16 = conv(dilations = var_794_dilations_0, groups = var_794_groups_0, pad = var_794_pad_0, pad_type = var_794_pad_type_0, strides = var_794_strides_0, weight = layers_1_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_51_cast_fp16)[name = tensor<string, []>("op_794_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_53_cast_fp16 = add(x = var_788_cast_fp16, y = var_794_cast_fp16)[name = tensor<string, []>("input_53_cast_fp16")];
             tensor<int32, []> input_55_split_num_splits_0 = const()[name = tensor<string, []>("input_55_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -547,8 +547,8 @@ program(1.0)
             tensor<int32, []> input_57_groups_0 = const()[name = tensor<string, []>("input_57_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_57_strides_0 = const()[name = tensor<string, []>("input_57_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_57_dilations_0 = const()[name = tensor<string, []>("input_57_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_270_to_fp16 = const()[name = tensor<string, []>("const_270_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31057408)))];
-            tensor<fp16, [1024]> const_271_to_fp16 = const()[name = tensor<string, []>("const_271_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31075904)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_270_to_fp16 = const()[name = tensor<string, []>("const_270_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42330880)))];
+            tensor<fp16, [1024]> const_271_to_fp16 = const()[name = tensor<string, []>("const_271_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42349376)))];
             tensor<fp16, [1, 1024, 1, 188]> input_59_cast_fp16 = conv(bias = const_271_to_fp16, dilations = input_57_dilations_0, groups = input_57_groups_0, pad = input_57_pad_0, pad_type = input_57_pad_type_0, strides = input_57_strides_0, weight = const_270_to_fp16, x = input_55_cast_fp16)[name = tensor<string, []>("input_59_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_61_cast_fp16 = silu(x = input_59_cast_fp16)[name = tensor<string, []>("input_61_cast_fp16")];
             tensor<string, []> var_816_pad_type_0 = const()[name = tensor<string, []>("op_816_pad_type_0"), val = tensor<string, []>("valid")];
@@ -556,22 +556,22 @@ program(1.0)
             tensor<int32, [4]> var_816_pad_0 = const()[name = tensor<string, []>("op_816_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_816_dilations_0 = const()[name = tensor<string, []>("op_816_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_816_groups_0 = const()[name = tensor<string, []>("op_816_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31078016))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31602368))), name = tensor<string, []>("layers_1_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42351488))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43137984))), name = tensor<string, []>("layers_1_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_816_cast_fp16 = conv(dilations = var_816_dilations_0, groups = var_816_groups_0, pad = var_816_pad_0, pad_type = var_816_pad_type_0, strides = var_816_strides_0, weight = layers_1_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_61_cast_fp16)[name = tensor<string, []>("op_816_cast_fp16")];
             tensor<string, []> var_822_pad_type_0 = const()[name = tensor<string, []>("op_822_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_822_strides_0 = const()[name = tensor<string, []>("op_822_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_822_pad_0 = const()[name = tensor<string, []>("op_822_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_822_dilations_0 = const()[name = tensor<string, []>("op_822_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_822_groups_0 = const()[name = tensor<string, []>("op_822_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_1_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31637696))), name = tensor<string, []>("layers_1_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17549]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31602496))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_1_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43173376))), name = tensor<string, []>("layers_1_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17549]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43138176))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_822_cast_fp16 = conv(dilations = var_822_dilations_0, groups = var_822_groups_0, pad = var_822_pad_0, pad_type = var_822_pad_type_0, strides = var_822_strides_0, weight = layers_1_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_61_cast_fp16)[name = tensor<string, []>("op_822_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_11_cast_fp16 = add(x = var_816_cast_fp16, y = var_822_cast_fp16)[name = tensor<string, []>("x_11_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_17_cast_fp16 = add(x = inputs_15_cast_fp16, y = x_11_cast_fp16)[name = tensor<string, []>("inputs_17_cast_fp16")];
             tensor<int32, [1]> out_17_axes_0 = const()[name = tensor<string, []>("out_17_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_833_to_fp16 = const()[name = tensor<string, []>("op_833_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_17_cast_fp16 = layer_norm(axes = out_17_axes_0, epsilon = var_833_to_fp16, x = inputs_17_cast_fp16)[name = tensor<string, []>("out_17_cast_fp16")];
-            tensor<fp16, [1024]> input_63_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_63_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31768832)))];
-            tensor<fp16, [1024]> input_63_beta_0_to_fp16 = const()[name = tensor<string, []>("input_63_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31770944)))];
+            tensor<fp16, [1024]> input_63_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_63_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43304512)))];
+            tensor<fp16, [1024]> input_63_beta_0_to_fp16 = const()[name = tensor<string, []>("input_63_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43306624)))];
             tensor<fp16, []> input_63_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_63_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_63_cast_fp16 = batch_norm(beta = input_63_beta_0_to_fp16, epsilon = input_63_epsilon_0_to_fp16, gamma = input_63_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_17_cast_fp16)[name = tensor<string, []>("input_63_cast_fp16")];
             tensor<string, []> var_853_pad_type_0 = const()[name = tensor<string, []>("op_853_pad_type_0"), val = tensor<string, []>("valid")];
@@ -579,14 +579,14 @@ program(1.0)
             tensor<int32, [4]> var_853_pad_0 = const()[name = tensor<string, []>("op_853_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_853_dilations_0 = const()[name = tensor<string, []>("op_853_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_853_groups_0 = const()[name = tensor<string, []>("op_853_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_1_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(31773056))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33870272))), name = tensor<string, []>("layers_1_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_1_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43308736))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(46454528))), name = tensor<string, []>("layers_1_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_853_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_853_dilations_0, groups = var_853_groups_0, pad = var_853_pad_0, pad_type = var_853_pad_type_0, strides = var_853_strides_0, weight = layers_1_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_63_cast_fp16)[name = tensor<string, []>("op_853_cast_fp16")];
             tensor<string, []> var_859_pad_type_0 = const()[name = tensor<string, []>("op_859_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_859_strides_0 = const()[name = tensor<string, []>("op_859_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_859_pad_0 = const()[name = tensor<string, []>("op_859_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_859_dilations_0 = const()[name = tensor<string, []>("op_859_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_859_groups_0 = const()[name = tensor<string, []>("op_859_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_1_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(34032832))), name = tensor<string, []>("layers_1_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81175]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(33870400))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_1_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(46617152))), name = tensor<string, []>("layers_1_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81175]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(46454720))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_859_cast_fp16 = conv(dilations = var_859_dilations_0, groups = var_859_groups_0, pad = var_859_pad_0, pad_type = var_859_pad_type_0, strides = var_859_strides_0, weight = layers_1_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_63_cast_fp16)[name = tensor<string, []>("op_859_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_65_cast_fp16 = add(x = var_853_cast_fp16, y = var_859_cast_fp16)[name = tensor<string, []>("input_65_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_67_cast_fp16 = silu(x = input_65_cast_fp16)[name = tensor<string, []>("input_67_cast_fp16")];
@@ -595,14 +595,14 @@ program(1.0)
             tensor<int32, [4]> var_870_pad_0 = const()[name = tensor<string, []>("op_870_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_870_dilations_0 = const()[name = tensor<string, []>("op_870_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_870_groups_0 = const()[name = tensor<string, []>("op_870_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_1_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(34557184))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(36654400))), name = tensor<string, []>("layers_1_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_1_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(47141504))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50287296))), name = tensor<string, []>("layers_1_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_870_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_870_dilations_0, groups = var_870_groups_0, pad = var_870_pad_0, pad_type = var_870_pad_type_0, strides = var_870_strides_0, weight = layers_1_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_67_cast_fp16)[name = tensor<string, []>("op_870_cast_fp16")];
             tensor<string, []> var_876_pad_type_0 = const()[name = tensor<string, []>("op_876_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_876_strides_0 = const()[name = tensor<string, []>("op_876_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_876_pad_0 = const()[name = tensor<string, []>("op_876_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_876_dilations_0 = const()[name = tensor<string, []>("op_876_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_876_groups_0 = const()[name = tensor<string, []>("op_876_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_1_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(36837376))), name = tensor<string, []>("layers_1_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [91372]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(36654528))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_1_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50470336))), name = tensor<string, []>("layers_1_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [91372]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50287488))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_876_cast_fp16 = conv(dilations = var_876_dilations_0, groups = var_876_groups_0, pad = var_876_pad_0, pad_type = var_876_pad_type_0, strides = var_876_strides_0, weight = layers_1_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_67_cast_fp16)[name = tensor<string, []>("op_876_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_13_cast_fp16 = add(x = var_870_cast_fp16, y = var_876_cast_fp16)[name = tensor<string, []>("x_13_cast_fp16")];
             tensor<fp16, []> var_878_to_fp16 = const()[name = tensor<string, []>("op_878_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -611,16 +611,16 @@ program(1.0)
             tensor<int32, [1]> out_19_axes_0 = const()[name = tensor<string, []>("out_19_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_889_to_fp16 = const()[name = tensor<string, []>("op_889_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_19_cast_fp16 = layer_norm(axes = out_19_axes_0, epsilon = var_889_to_fp16, x = inputs_19_cast_fp16)[name = tensor<string, []>("out_19_cast_fp16")];
-            tensor<fp16, [1024]> inputs_21_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_21_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37361728)))];
-            tensor<fp16, [1024]> inputs_21_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_21_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37363840)))];
+            tensor<fp16, [1024]> inputs_21_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_21_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50994688)))];
+            tensor<fp16, [1024]> inputs_21_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_21_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50996800)))];
             tensor<fp16, []> inputs_21_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_21_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_21_cast_fp16 = batch_norm(beta = inputs_21_beta_0_to_fp16, epsilon = inputs_21_epsilon_0_to_fp16, gamma = inputs_21_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_19_cast_fp16)[name = tensor<string, []>("inputs_21_cast_fp16")];
             tensor<int32, []> var_903 = const()[name = tensor<string, []>("op_903"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_21_axes_0 = const()[name = tensor<string, []>("out_21_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_934_to_fp16 = const()[name = tensor<string, []>("op_934_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_21_cast_fp16 = layer_norm(axes = out_21_axes_0, epsilon = var_934_to_fp16, x = inputs_21_cast_fp16)[name = tensor<string, []>("out_21_cast_fp16")];
-            tensor<fp16, [1024]> input_69_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_69_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37365952)))];
-            tensor<fp16, [1024]> input_69_beta_0_to_fp16 = const()[name = tensor<string, []>("input_69_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37368064)))];
+            tensor<fp16, [1024]> input_69_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_69_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50998912)))];
+            tensor<fp16, [1024]> input_69_beta_0_to_fp16 = const()[name = tensor<string, []>("input_69_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(51001024)))];
             tensor<fp16, []> input_69_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_69_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_69_cast_fp16 = batch_norm(beta = input_69_beta_0_to_fp16, epsilon = input_69_epsilon_0_to_fp16, gamma = input_69_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_21_cast_fp16)[name = tensor<string, []>("input_69_cast_fp16")];
             tensor<string, []> var_954_pad_type_0 = const()[name = tensor<string, []>("op_954_pad_type_0"), val = tensor<string, []>("valid")];
@@ -628,14 +628,14 @@ program(1.0)
             tensor<int32, [4]> var_954_pad_0 = const()[name = tensor<string, []>("op_954_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_954_dilations_0 = const()[name = tensor<string, []>("op_954_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_954_groups_0 = const()[name = tensor<string, []>("op_954_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_2_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(37370176))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39467392))), name = tensor<string, []>("layers_2_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_2_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(51003136))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(54148928))), name = tensor<string, []>("layers_2_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_954_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_954_dilations_0, groups = var_954_groups_0, pad = var_954_pad_0, pad_type = var_954_pad_type_0, strides = var_954_strides_0, weight = layers_2_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_69_cast_fp16)[name = tensor<string, []>("op_954_cast_fp16")];
             tensor<string, []> var_960_pad_type_0 = const()[name = tensor<string, []>("op_960_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_960_strides_0 = const()[name = tensor<string, []>("op_960_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_960_pad_0 = const()[name = tensor<string, []>("op_960_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_960_dilations_0 = const()[name = tensor<string, []>("op_960_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_960_groups_0 = const()[name = tensor<string, []>("op_960_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_2_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39599936))), name = tensor<string, []>("layers_2_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [66161]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(39467520))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_2_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(54281536))), name = tensor<string, []>("layers_2_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [66161]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(54149120))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_960_cast_fp16 = conv(dilations = var_960_dilations_0, groups = var_960_groups_0, pad = var_960_pad_0, pad_type = var_960_pad_type_0, strides = var_960_strides_0, weight = layers_2_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_69_cast_fp16)[name = tensor<string, []>("op_960_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_71_cast_fp16 = add(x = var_954_cast_fp16, y = var_960_cast_fp16)[name = tensor<string, []>("input_71_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_73_cast_fp16 = silu(x = input_71_cast_fp16)[name = tensor<string, []>("input_73_cast_fp16")];
@@ -644,14 +644,14 @@ program(1.0)
             tensor<int32, [4]> var_971_pad_0 = const()[name = tensor<string, []>("op_971_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_971_dilations_0 = const()[name = tensor<string, []>("op_971_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_971_groups_0 = const()[name = tensor<string, []>("op_971_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_2_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(40124288))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42221504))), name = tensor<string, []>("layers_2_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_2_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(54805888))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(57951680))), name = tensor<string, []>("layers_2_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_971_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_971_dilations_0, groups = var_971_groups_0, pad = var_971_pad_0, pad_type = var_971_pad_type_0, strides = var_971_strides_0, weight = layers_2_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_73_cast_fp16)[name = tensor<string, []>("op_971_cast_fp16")];
             tensor<string, []> var_977_pad_type_0 = const()[name = tensor<string, []>("op_977_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_977_strides_0 = const()[name = tensor<string, []>("op_977_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_977_pad_0 = const()[name = tensor<string, []>("op_977_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_977_dilations_0 = const()[name = tensor<string, []>("op_977_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_977_groups_0 = const()[name = tensor<string, []>("op_977_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_2_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42382848))), name = tensor<string, []>("layers_2_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [80551]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42221632))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_2_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(58113088))), name = tensor<string, []>("layers_2_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [80551]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(57951872))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_977_cast_fp16 = conv(dilations = var_977_dilations_0, groups = var_977_groups_0, pad = var_977_pad_0, pad_type = var_977_pad_type_0, strides = var_977_strides_0, weight = layers_2_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_73_cast_fp16)[name = tensor<string, []>("op_977_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_15_cast_fp16 = add(x = var_971_cast_fp16, y = var_977_cast_fp16)[name = tensor<string, []>("x_15_cast_fp16")];
             tensor<fp16, []> var_979_to_fp16 = const()[name = tensor<string, []>("op_979_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -660,8 +660,8 @@ program(1.0)
             tensor<int32, [1]> out_23_axes_0 = const()[name = tensor<string, []>("out_23_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_990_to_fp16 = const()[name = tensor<string, []>("op_990_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_23_cast_fp16 = layer_norm(axes = out_23_axes_0, epsilon = var_990_to_fp16, x = inputs_23_cast_fp16)[name = tensor<string, []>("out_23_cast_fp16")];
-            tensor<fp16, [1024]> obj_11_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_11_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42907200)))];
-            tensor<fp16, [1024]> obj_11_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_11_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42909312)))];
+            tensor<fp16, [1024]> obj_11_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_11_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(58637440)))];
+            tensor<fp16, [1024]> obj_11_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_11_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(58639552)))];
             tensor<fp16, []> obj_11_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_11_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_11_cast_fp16 = batch_norm(beta = obj_11_beta_0_to_fp16, epsilon = obj_11_epsilon_0_to_fp16, gamma = obj_11_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_23_cast_fp16)[name = tensor<string, []>("obj_11_cast_fp16")];
             tensor<string, []> var_1015_pad_type_0 = const()[name = tensor<string, []>("op_1015_pad_type_0"), val = tensor<string, []>("valid")];
@@ -669,14 +669,14 @@ program(1.0)
             tensor<int32, [4]> var_1015_pad_0 = const()[name = tensor<string, []>("op_1015_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1015_dilations_0 = const()[name = tensor<string, []>("op_1015_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1015_groups_0 = const()[name = tensor<string, []>("op_1015_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(42911424))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43435776))), name = tensor<string, []>("layers_2_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(58641664))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(59428160))), name = tensor<string, []>("layers_2_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1015_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1015_dilations_0, groups = var_1015_groups_0, pad = var_1015_pad_0, pad_type = var_1015_pad_type_0, strides = var_1015_strides_0, weight = layers_2_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_11_cast_fp16)[name = tensor<string, []>("op_1015_cast_fp16")];
             tensor<string, []> var_1021_pad_type_0 = const()[name = tensor<string, []>("op_1021_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1021_strides_0 = const()[name = tensor<string, []>("op_1021_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1021_pad_0 = const()[name = tensor<string, []>("op_1021_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1021_dilations_0 = const()[name = tensor<string, []>("op_1021_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1021_groups_0 = const()[name = tensor<string, []>("op_1021_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43473408))), name = tensor<string, []>("layers_2_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18693]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43435904))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(59465856))), name = tensor<string, []>("layers_2_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18693]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(59428352))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1021_cast_fp16 = conv(dilations = var_1021_dilations_0, groups = var_1021_groups_0, pad = var_1021_pad_0, pad_type = var_1021_pad_type_0, strides = var_1021_strides_0, weight = layers_2_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_11_cast_fp16)[name = tensor<string, []>("op_1021_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_9_cast_fp16 = add(x = var_1015_cast_fp16, y = var_1021_cast_fp16)[name = tensor<string, []>("query_9_cast_fp16")];
             tensor<string, []> var_1030_pad_type_0 = const()[name = tensor<string, []>("op_1030_pad_type_0"), val = tensor<string, []>("valid")];
@@ -684,14 +684,14 @@ program(1.0)
             tensor<int32, [4]> var_1030_pad_0 = const()[name = tensor<string, []>("op_1030_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1030_dilations_0 = const()[name = tensor<string, []>("op_1030_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1030_groups_0 = const()[name = tensor<string, []>("op_1030_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(43604544))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44128896))), name = tensor<string, []>("layers_2_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(59596992))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60383488))), name = tensor<string, []>("layers_2_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1030_cast_fp16 = conv(dilations = var_1030_dilations_0, groups = var_1030_groups_0, pad = var_1030_pad_0, pad_type = var_1030_pad_type_0, strides = var_1030_strides_0, weight = layers_2_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_11_cast_fp16)[name = tensor<string, []>("op_1030_cast_fp16")];
             tensor<string, []> var_1036_pad_type_0 = const()[name = tensor<string, []>("op_1036_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1036_strides_0 = const()[name = tensor<string, []>("op_1036_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1036_pad_0 = const()[name = tensor<string, []>("op_1036_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1036_dilations_0 = const()[name = tensor<string, []>("op_1036_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1036_groups_0 = const()[name = tensor<string, []>("op_1036_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44169984))), name = tensor<string, []>("layers_2_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [20442]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44129024))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60424640))), name = tensor<string, []>("layers_2_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [20442]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60383680))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1036_cast_fp16 = conv(dilations = var_1036_dilations_0, groups = var_1036_groups_0, pad = var_1036_pad_0, pad_type = var_1036_pad_type_0, strides = var_1036_strides_0, weight = layers_2_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_11_cast_fp16)[name = tensor<string, []>("op_1036_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_5_cast_fp16 = add(x = var_1030_cast_fp16, y = var_1036_cast_fp16)[name = tensor<string, []>("key_5_cast_fp16")];
             tensor<string, []> var_1046_pad_type_0 = const()[name = tensor<string, []>("op_1046_pad_type_0"), val = tensor<string, []>("valid")];
@@ -699,33 +699,33 @@ program(1.0)
             tensor<int32, [4]> var_1046_pad_0 = const()[name = tensor<string, []>("op_1046_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1046_dilations_0 = const()[name = tensor<string, []>("op_1046_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1046_groups_0 = const()[name = tensor<string, []>("op_1046_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44301120))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44825472))), name = tensor<string, []>("layers_2_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60555776))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61342272))), name = tensor<string, []>("layers_2_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1046_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1046_dilations_0, groups = var_1046_groups_0, pad = var_1046_pad_0, pad_type = var_1046_pad_type_0, strides = var_1046_strides_0, weight = layers_2_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_11_cast_fp16)[name = tensor<string, []>("op_1046_cast_fp16")];
             tensor<string, []> var_1052_pad_type_0 = const()[name = tensor<string, []>("op_1052_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1052_strides_0 = const()[name = tensor<string, []>("op_1052_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1052_pad_0 = const()[name = tensor<string, []>("op_1052_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1052_dilations_0 = const()[name = tensor<string, []>("op_1052_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1052_groups_0 = const()[name = tensor<string, []>("op_1052_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44857152))), name = tensor<string, []>("layers_2_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15713]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44825600))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61374016))), name = tensor<string, []>("layers_2_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15713]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61342464))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1052_cast_fp16 = conv(dilations = var_1052_dilations_0, groups = var_1052_groups_0, pad = var_1052_pad_0, pad_type = var_1052_pad_type_0, strides = var_1052_strides_0, weight = layers_2_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_11_cast_fp16)[name = tensor<string, []>("op_1052_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_5_cast_fp16 = add(x = var_1046_cast_fp16, y = var_1052_cast_fp16)[name = tensor<string, []>("value_5_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_1055_to_fp16 = const()[name = tensor<string, []>("op_1055_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44988288)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1055_to_fp16 = const()[name = tensor<string, []>("op_1055_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61505152)))];
             tensor<fp16, [1, 1024, 1, 188]> query_11_cast_fp16 = add(x = query_9_cast_fp16, y = var_1055_to_fp16)[name = tensor<string, []>("query_11_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_1058_to_fp16 = const()[name = tensor<string, []>("op_1058_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44990400)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1058_to_fp16 = const()[name = tensor<string, []>("op_1058_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61507264)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_5_cast_fp16 = add(x = query_9_cast_fp16, y = var_1058_to_fp16)[name = tensor<string, []>("q_with_bias_v_5_cast_fp16")];
             tensor<string, []> var_1068_pad_type_0 = const()[name = tensor<string, []>("op_1068_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1068_strides_0 = const()[name = tensor<string, []>("op_1068_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1068_pad_0 = const()[name = tensor<string, []>("op_1068_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1068_dilations_0 = const()[name = tensor<string, []>("op_1068_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1068_groups_0 = const()[name = tensor<string, []>("op_1068_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(44992512))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(45516864))), name = tensor<string, []>("layers_2_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61509376))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(62295872))), name = tensor<string, []>("layers_2_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_1068_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1068_dilations_0, groups = var_1068_groups_0, pad = var_1068_pad_0, pad_type = var_1068_pad_type_0, strides = var_1068_strides_0, weight = layers_2_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_1068_cast_fp16")];
             tensor<string, []> var_1074_pad_type_0 = const()[name = tensor<string, []>("op_1074_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1074_strides_0 = const()[name = tensor<string, []>("op_1074_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1074_pad_0 = const()[name = tensor<string, []>("op_1074_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1074_dilations_0 = const()[name = tensor<string, []>("op_1074_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1074_groups_0 = const()[name = tensor<string, []>("op_1074_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(45598208))), name = tensor<string, []>("layers_2_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [40571]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(45516992))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(62377280))), name = tensor<string, []>("layers_2_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [40571]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(62296064))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_1074_cast_fp16 = conv(dilations = var_1074_dilations_0, groups = var_1074_groups_0, pad = var_1074_pad_0, pad_type = var_1074_pad_type_0, strides = var_1074_strides_0, weight = layers_2_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_1074_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_5_cast_fp16 = add(x = var_1068_cast_fp16, y = var_1074_cast_fp16)[name = tensor<string, []>("p_5_cast_fp16")];
             tensor<int32, [4]> var_1078 = const()[name = tensor<string, []>("op_1078"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -776,22 +776,22 @@ program(1.0)
             tensor<int32, [4]> var_1131_pad_0 = const()[name = tensor<string, []>("op_1131_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1131_dilations_0 = const()[name = tensor<string, []>("op_1131_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1131_groups_0 = const()[name = tensor<string, []>("op_1131_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(45729344))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(46253696))), name = tensor<string, []>("layers_2_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(62508416))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63294912))), name = tensor<string, []>("layers_2_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1131_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1131_dilations_0, groups = var_1131_groups_0, pad = var_1131_pad_0, pad_type = var_1131_pad_type_0, strides = var_1131_strides_0, weight = layers_2_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_75_cast_fp16)[name = tensor<string, []>("op_1131_cast_fp16")];
             tensor<string, []> var_1137_pad_type_0 = const()[name = tensor<string, []>("op_1137_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1137_strides_0 = const()[name = tensor<string, []>("op_1137_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1137_pad_0 = const()[name = tensor<string, []>("op_1137_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1137_dilations_0 = const()[name = tensor<string, []>("op_1137_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1137_groups_0 = const()[name = tensor<string, []>("op_1137_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(46286400))), name = tensor<string, []>("layers_2_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16225]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(46253824))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63327680))), name = tensor<string, []>("layers_2_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16225]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63295104))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1137_cast_fp16 = conv(dilations = var_1137_dilations_0, groups = var_1137_groups_0, pad = var_1137_pad_0, pad_type = var_1137_pad_type_0, strides = var_1137_strides_0, weight = layers_2_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_75_cast_fp16)[name = tensor<string, []>("op_1137_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_13_cast_fp16 = add(x = var_1131_cast_fp16, y = var_1137_cast_fp16)[name = tensor<string, []>("obj_13_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_25_cast_fp16 = add(x = inputs_23_cast_fp16, y = obj_13_cast_fp16)[name = tensor<string, []>("inputs_25_cast_fp16")];
             tensor<int32, [1]> out_25_axes_0 = const()[name = tensor<string, []>("out_25_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1148_to_fp16 = const()[name = tensor<string, []>("op_1148_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_25_cast_fp16 = layer_norm(axes = out_25_axes_0, epsilon = var_1148_to_fp16, x = inputs_25_cast_fp16)[name = tensor<string, []>("out_25_cast_fp16")];
-            tensor<fp16, [1024]> input_77_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_77_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(46417536)))];
-            tensor<fp16, [1024]> input_77_beta_0_to_fp16 = const()[name = tensor<string, []>("input_77_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(46419648)))];
+            tensor<fp16, [1024]> input_77_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_77_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63458816)))];
+            tensor<fp16, [1024]> input_77_beta_0_to_fp16 = const()[name = tensor<string, []>("input_77_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63460928)))];
             tensor<fp16, []> input_77_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_77_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_77_cast_fp16 = batch_norm(beta = input_77_beta_0_to_fp16, epsilon = input_77_epsilon_0_to_fp16, gamma = input_77_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_25_cast_fp16)[name = tensor<string, []>("input_77_cast_fp16")];
             tensor<string, []> var_1169_pad_type_0 = const()[name = tensor<string, []>("op_1169_pad_type_0"), val = tensor<string, []>("valid")];
@@ -799,14 +799,14 @@ program(1.0)
             tensor<int32, [4]> var_1169_pad_0 = const()[name = tensor<string, []>("op_1169_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1169_dilations_0 = const()[name = tensor<string, []>("op_1169_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1169_groups_0 = const()[name = tensor<string, []>("op_1169_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_2_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(46421760))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(47470400))), name = tensor<string, []>("layers_2_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_2_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63463040))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65035968))), name = tensor<string, []>("layers_2_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_1169_cast_fp16 = conv(dilations = var_1169_dilations_0, groups = var_1169_groups_0, pad = var_1169_pad_0, pad_type = var_1169_pad_type_0, strides = var_1169_strides_0, weight = layers_2_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_77_cast_fp16)[name = tensor<string, []>("op_1169_cast_fp16")];
             tensor<string, []> var_1175_pad_type_0 = const()[name = tensor<string, []>("op_1175_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1175_strides_0 = const()[name = tensor<string, []>("op_1175_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1175_pad_0 = const()[name = tensor<string, []>("op_1175_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1175_dilations_0 = const()[name = tensor<string, []>("op_1175_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1175_groups_0 = const()[name = tensor<string, []>("op_1175_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_2_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(47536832))), name = tensor<string, []>("layers_2_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33120]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(47470528))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_2_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65102464))), name = tensor<string, []>("layers_2_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33120]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65036160))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_1175_cast_fp16 = conv(dilations = var_1175_dilations_0, groups = var_1175_groups_0, pad = var_1175_pad_0, pad_type = var_1175_pad_type_0, strides = var_1175_strides_0, weight = layers_2_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_77_cast_fp16)[name = tensor<string, []>("op_1175_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_79_cast_fp16 = add(x = var_1169_cast_fp16, y = var_1175_cast_fp16)[name = tensor<string, []>("input_79_cast_fp16")];
             tensor<int32, []> input_81_split_num_splits_0 = const()[name = tensor<string, []>("input_81_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -819,8 +819,8 @@ program(1.0)
             tensor<int32, []> input_83_groups_0 = const()[name = tensor<string, []>("input_83_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_83_strides_0 = const()[name = tensor<string, []>("input_83_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_83_dilations_0 = const()[name = tensor<string, []>("input_83_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_272_to_fp16 = const()[name = tensor<string, []>("const_272_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(47799040)))];
-            tensor<fp16, [1024]> const_273_to_fp16 = const()[name = tensor<string, []>("const_273_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(47817536)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_272_to_fp16 = const()[name = tensor<string, []>("const_272_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65364672)))];
+            tensor<fp16, [1024]> const_273_to_fp16 = const()[name = tensor<string, []>("const_273_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65383168)))];
             tensor<fp16, [1, 1024, 1, 188]> input_85_cast_fp16 = conv(bias = const_273_to_fp16, dilations = input_83_dilations_0, groups = input_83_groups_0, pad = input_83_pad_0, pad_type = input_83_pad_type_0, strides = input_83_strides_0, weight = const_272_to_fp16, x = input_81_cast_fp16)[name = tensor<string, []>("input_85_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_87_cast_fp16 = silu(x = input_85_cast_fp16)[name = tensor<string, []>("input_87_cast_fp16")];
             tensor<string, []> var_1197_pad_type_0 = const()[name = tensor<string, []>("op_1197_pad_type_0"), val = tensor<string, []>("valid")];
@@ -828,22 +828,22 @@ program(1.0)
             tensor<int32, [4]> var_1197_pad_0 = const()[name = tensor<string, []>("op_1197_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1197_dilations_0 = const()[name = tensor<string, []>("op_1197_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1197_groups_0 = const()[name = tensor<string, []>("op_1197_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(47819648))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(48344000))), name = tensor<string, []>("layers_2_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65385280))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(66171776))), name = tensor<string, []>("layers_2_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1197_cast_fp16 = conv(dilations = var_1197_dilations_0, groups = var_1197_groups_0, pad = var_1197_pad_0, pad_type = var_1197_pad_type_0, strides = var_1197_strides_0, weight = layers_2_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_87_cast_fp16)[name = tensor<string, []>("op_1197_cast_fp16")];
             tensor<string, []> var_1203_pad_type_0 = const()[name = tensor<string, []>("op_1203_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1203_strides_0 = const()[name = tensor<string, []>("op_1203_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1203_pad_0 = const()[name = tensor<string, []>("op_1203_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1203_dilations_0 = const()[name = tensor<string, []>("op_1203_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1203_groups_0 = const()[name = tensor<string, []>("op_1203_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_2_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(48378112))), name = tensor<string, []>("layers_2_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16934]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(48344128))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_2_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(66205952))), name = tensor<string, []>("layers_2_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16934]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(66171968))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1203_cast_fp16 = conv(dilations = var_1203_dilations_0, groups = var_1203_groups_0, pad = var_1203_pad_0, pad_type = var_1203_pad_type_0, strides = var_1203_strides_0, weight = layers_2_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_87_cast_fp16)[name = tensor<string, []>("op_1203_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_17_cast_fp16 = add(x = var_1197_cast_fp16, y = var_1203_cast_fp16)[name = tensor<string, []>("x_17_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_27_cast_fp16 = add(x = inputs_25_cast_fp16, y = x_17_cast_fp16)[name = tensor<string, []>("inputs_27_cast_fp16")];
             tensor<int32, [1]> out_27_axes_0 = const()[name = tensor<string, []>("out_27_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1214_to_fp16 = const()[name = tensor<string, []>("op_1214_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_27_cast_fp16 = layer_norm(axes = out_27_axes_0, epsilon = var_1214_to_fp16, x = inputs_27_cast_fp16)[name = tensor<string, []>("out_27_cast_fp16")];
-            tensor<fp16, [1024]> input_89_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_89_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(48509248)))];
-            tensor<fp16, [1024]> input_89_beta_0_to_fp16 = const()[name = tensor<string, []>("input_89_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(48511360)))];
+            tensor<fp16, [1024]> input_89_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_89_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(66337088)))];
+            tensor<fp16, [1024]> input_89_beta_0_to_fp16 = const()[name = tensor<string, []>("input_89_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(66339200)))];
             tensor<fp16, []> input_89_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_89_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_89_cast_fp16 = batch_norm(beta = input_89_beta_0_to_fp16, epsilon = input_89_epsilon_0_to_fp16, gamma = input_89_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_27_cast_fp16)[name = tensor<string, []>("input_89_cast_fp16")];
             tensor<string, []> var_1234_pad_type_0 = const()[name = tensor<string, []>("op_1234_pad_type_0"), val = tensor<string, []>("valid")];
@@ -851,14 +851,14 @@ program(1.0)
             tensor<int32, [4]> var_1234_pad_0 = const()[name = tensor<string, []>("op_1234_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1234_dilations_0 = const()[name = tensor<string, []>("op_1234_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1234_groups_0 = const()[name = tensor<string, []>("op_1234_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_2_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(48513472))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50610688))), name = tensor<string, []>("layers_2_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_2_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(66341312))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(69487104))), name = tensor<string, []>("layers_2_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_1234_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_1234_dilations_0, groups = var_1234_groups_0, pad = var_1234_pad_0, pad_type = var_1234_pad_type_0, strides = var_1234_strides_0, weight = layers_2_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_89_cast_fp16)[name = tensor<string, []>("op_1234_cast_fp16")];
             tensor<string, []> var_1240_pad_type_0 = const()[name = tensor<string, []>("op_1240_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1240_strides_0 = const()[name = tensor<string, []>("op_1240_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1240_pad_0 = const()[name = tensor<string, []>("op_1240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1240_dilations_0 = const()[name = tensor<string, []>("op_1240_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1240_groups_0 = const()[name = tensor<string, []>("op_1240_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_2_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50765632))), name = tensor<string, []>("layers_2_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [77373]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(50610816))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_2_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(69642112))), name = tensor<string, []>("layers_2_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [77373]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(69487296))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_1240_cast_fp16 = conv(dilations = var_1240_dilations_0, groups = var_1240_groups_0, pad = var_1240_pad_0, pad_type = var_1240_pad_type_0, strides = var_1240_strides_0, weight = layers_2_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_89_cast_fp16)[name = tensor<string, []>("op_1240_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_91_cast_fp16 = add(x = var_1234_cast_fp16, y = var_1240_cast_fp16)[name = tensor<string, []>("input_91_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_93_cast_fp16 = silu(x = input_91_cast_fp16)[name = tensor<string, []>("input_93_cast_fp16")];
@@ -867,14 +867,14 @@ program(1.0)
             tensor<int32, [4]> var_1251_pad_0 = const()[name = tensor<string, []>("op_1251_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1251_dilations_0 = const()[name = tensor<string, []>("op_1251_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1251_groups_0 = const()[name = tensor<string, []>("op_1251_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_2_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(51289984))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(53387200))), name = tensor<string, []>("layers_2_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_2_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(70166464))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(73312256))), name = tensor<string, []>("layers_2_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1251_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1251_dilations_0, groups = var_1251_groups_0, pad = var_1251_pad_0, pad_type = var_1251_pad_type_0, strides = var_1251_strides_0, weight = layers_2_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_93_cast_fp16)[name = tensor<string, []>("op_1251_cast_fp16")];
             tensor<string, []> var_1257_pad_type_0 = const()[name = tensor<string, []>("op_1257_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1257_strides_0 = const()[name = tensor<string, []>("op_1257_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1257_pad_0 = const()[name = tensor<string, []>("op_1257_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1257_dilations_0 = const()[name = tensor<string, []>("op_1257_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1257_groups_0 = const()[name = tensor<string, []>("op_1257_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_2_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(53562752))), name = tensor<string, []>("layers_2_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [87677]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(53387328))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_2_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(73487872))), name = tensor<string, []>("layers_2_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [87677]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(73312448))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1257_cast_fp16 = conv(dilations = var_1257_dilations_0, groups = var_1257_groups_0, pad = var_1257_pad_0, pad_type = var_1257_pad_type_0, strides = var_1257_strides_0, weight = layers_2_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_93_cast_fp16)[name = tensor<string, []>("op_1257_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_19_cast_fp16 = add(x = var_1251_cast_fp16, y = var_1257_cast_fp16)[name = tensor<string, []>("x_19_cast_fp16")];
             tensor<fp16, []> var_1259_to_fp16 = const()[name = tensor<string, []>("op_1259_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -883,16 +883,16 @@ program(1.0)
             tensor<int32, [1]> out_29_axes_0 = const()[name = tensor<string, []>("out_29_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1270_to_fp16 = const()[name = tensor<string, []>("op_1270_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_29_cast_fp16 = layer_norm(axes = out_29_axes_0, epsilon = var_1270_to_fp16, x = inputs_29_cast_fp16)[name = tensor<string, []>("out_29_cast_fp16")];
-            tensor<fp16, [1024]> inputs_31_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_31_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(54087104)))];
-            tensor<fp16, [1024]> inputs_31_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_31_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(54089216)))];
+            tensor<fp16, [1024]> inputs_31_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_31_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(74012224)))];
+            tensor<fp16, [1024]> inputs_31_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_31_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(74014336)))];
             tensor<fp16, []> inputs_31_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_31_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_31_cast_fp16 = batch_norm(beta = inputs_31_beta_0_to_fp16, epsilon = inputs_31_epsilon_0_to_fp16, gamma = inputs_31_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_29_cast_fp16)[name = tensor<string, []>("inputs_31_cast_fp16")];
             tensor<int32, []> var_1284 = const()[name = tensor<string, []>("op_1284"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_31_axes_0 = const()[name = tensor<string, []>("out_31_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1315_to_fp16 = const()[name = tensor<string, []>("op_1315_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_31_cast_fp16 = layer_norm(axes = out_31_axes_0, epsilon = var_1315_to_fp16, x = inputs_31_cast_fp16)[name = tensor<string, []>("out_31_cast_fp16")];
-            tensor<fp16, [1024]> input_95_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_95_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(54091328)))];
-            tensor<fp16, [1024]> input_95_beta_0_to_fp16 = const()[name = tensor<string, []>("input_95_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(54093440)))];
+            tensor<fp16, [1024]> input_95_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_95_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(74016448)))];
+            tensor<fp16, [1024]> input_95_beta_0_to_fp16 = const()[name = tensor<string, []>("input_95_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(74018560)))];
             tensor<fp16, []> input_95_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_95_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_95_cast_fp16 = batch_norm(beta = input_95_beta_0_to_fp16, epsilon = input_95_epsilon_0_to_fp16, gamma = input_95_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_31_cast_fp16)[name = tensor<string, []>("input_95_cast_fp16")];
             tensor<string, []> var_1335_pad_type_0 = const()[name = tensor<string, []>("op_1335_pad_type_0"), val = tensor<string, []>("valid")];
@@ -900,14 +900,14 @@ program(1.0)
             tensor<int32, [4]> var_1335_pad_0 = const()[name = tensor<string, []>("op_1335_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1335_dilations_0 = const()[name = tensor<string, []>("op_1335_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1335_groups_0 = const()[name = tensor<string, []>("op_1335_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_3_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(54095552))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(56192768))), name = tensor<string, []>("layers_3_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_3_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(74020672))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77166464))), name = tensor<string, []>("layers_3_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_1335_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_1335_dilations_0, groups = var_1335_groups_0, pad = var_1335_pad_0, pad_type = var_1335_pad_type_0, strides = var_1335_strides_0, weight = layers_3_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_95_cast_fp16)[name = tensor<string, []>("op_1335_cast_fp16")];
             tensor<string, []> var_1341_pad_type_0 = const()[name = tensor<string, []>("op_1341_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1341_strides_0 = const()[name = tensor<string, []>("op_1341_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1341_pad_0 = const()[name = tensor<string, []>("op_1341_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1341_dilations_0 = const()[name = tensor<string, []>("op_1341_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1341_groups_0 = const()[name = tensor<string, []>("op_1341_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_3_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(56333504))), name = tensor<string, []>("layers_3_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [70250]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(56192896))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_3_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77307264))), name = tensor<string, []>("layers_3_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [70250]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77166656))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_1341_cast_fp16 = conv(dilations = var_1341_dilations_0, groups = var_1341_groups_0, pad = var_1341_pad_0, pad_type = var_1341_pad_type_0, strides = var_1341_strides_0, weight = layers_3_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_95_cast_fp16)[name = tensor<string, []>("op_1341_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_97_cast_fp16 = add(x = var_1335_cast_fp16, y = var_1341_cast_fp16)[name = tensor<string, []>("input_97_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_99_cast_fp16 = silu(x = input_97_cast_fp16)[name = tensor<string, []>("input_99_cast_fp16")];
@@ -916,14 +916,14 @@ program(1.0)
             tensor<int32, [4]> var_1352_pad_0 = const()[name = tensor<string, []>("op_1352_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1352_dilations_0 = const()[name = tensor<string, []>("op_1352_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1352_groups_0 = const()[name = tensor<string, []>("op_1352_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_3_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(56857856))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(58955072))), name = tensor<string, []>("layers_3_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_3_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77831616))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(80977408))), name = tensor<string, []>("layers_3_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1352_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1352_dilations_0, groups = var_1352_groups_0, pad = var_1352_pad_0, pad_type = var_1352_pad_type_0, strides = var_1352_strides_0, weight = layers_3_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_99_cast_fp16)[name = tensor<string, []>("op_1352_cast_fp16")];
             tensor<string, []> var_1358_pad_type_0 = const()[name = tensor<string, []>("op_1358_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1358_strides_0 = const()[name = tensor<string, []>("op_1358_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1358_pad_0 = const()[name = tensor<string, []>("op_1358_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1358_dilations_0 = const()[name = tensor<string, []>("op_1358_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1358_groups_0 = const()[name = tensor<string, []>("op_1358_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_3_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(59134976))), name = tensor<string, []>("layers_3_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89831]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(58955200))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_3_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81157376))), name = tensor<string, []>("layers_3_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89831]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(80977600))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1358_cast_fp16 = conv(dilations = var_1358_dilations_0, groups = var_1358_groups_0, pad = var_1358_pad_0, pad_type = var_1358_pad_type_0, strides = var_1358_strides_0, weight = layers_3_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_99_cast_fp16)[name = tensor<string, []>("op_1358_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_21_cast_fp16 = add(x = var_1352_cast_fp16, y = var_1358_cast_fp16)[name = tensor<string, []>("x_21_cast_fp16")];
             tensor<fp16, []> var_1360_to_fp16 = const()[name = tensor<string, []>("op_1360_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -932,8 +932,8 @@ program(1.0)
             tensor<int32, [1]> out_33_axes_0 = const()[name = tensor<string, []>("out_33_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1371_to_fp16 = const()[name = tensor<string, []>("op_1371_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_33_cast_fp16 = layer_norm(axes = out_33_axes_0, epsilon = var_1371_to_fp16, x = inputs_33_cast_fp16)[name = tensor<string, []>("out_33_cast_fp16")];
-            tensor<fp16, [1024]> obj_15_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_15_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(59659328)))];
-            tensor<fp16, [1024]> obj_15_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_15_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(59661440)))];
+            tensor<fp16, [1024]> obj_15_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_15_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81681728)))];
+            tensor<fp16, [1024]> obj_15_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_15_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81683840)))];
             tensor<fp16, []> obj_15_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_15_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_15_cast_fp16 = batch_norm(beta = obj_15_beta_0_to_fp16, epsilon = obj_15_epsilon_0_to_fp16, gamma = obj_15_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_33_cast_fp16)[name = tensor<string, []>("obj_15_cast_fp16")];
             tensor<string, []> var_1396_pad_type_0 = const()[name = tensor<string, []>("op_1396_pad_type_0"), val = tensor<string, []>("valid")];
@@ -941,14 +941,14 @@ program(1.0)
             tensor<int32, [4]> var_1396_pad_0 = const()[name = tensor<string, []>("op_1396_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1396_dilations_0 = const()[name = tensor<string, []>("op_1396_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1396_groups_0 = const()[name = tensor<string, []>("op_1396_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(59663552))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60187904))), name = tensor<string, []>("layers_3_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81685952))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(82472448))), name = tensor<string, []>("layers_3_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1396_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1396_dilations_0, groups = var_1396_groups_0, pad = var_1396_pad_0, pad_type = var_1396_pad_type_0, strides = var_1396_strides_0, weight = layers_3_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_15_cast_fp16)[name = tensor<string, []>("op_1396_cast_fp16")];
             tensor<string, []> var_1402_pad_type_0 = const()[name = tensor<string, []>("op_1402_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1402_strides_0 = const()[name = tensor<string, []>("op_1402_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1402_pad_0 = const()[name = tensor<string, []>("op_1402_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1402_dilations_0 = const()[name = tensor<string, []>("op_1402_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1402_groups_0 = const()[name = tensor<string, []>("op_1402_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60224896))), name = tensor<string, []>("layers_3_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18387]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60188032))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(82509504))), name = tensor<string, []>("layers_3_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18387]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(82472640))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1402_cast_fp16 = conv(dilations = var_1402_dilations_0, groups = var_1402_groups_0, pad = var_1402_pad_0, pad_type = var_1402_pad_type_0, strides = var_1402_strides_0, weight = layers_3_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_15_cast_fp16)[name = tensor<string, []>("op_1402_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_13_cast_fp16 = add(x = var_1396_cast_fp16, y = var_1402_cast_fp16)[name = tensor<string, []>("query_13_cast_fp16")];
             tensor<string, []> var_1411_pad_type_0 = const()[name = tensor<string, []>("op_1411_pad_type_0"), val = tensor<string, []>("valid")];
@@ -956,14 +956,14 @@ program(1.0)
             tensor<int32, [4]> var_1411_pad_0 = const()[name = tensor<string, []>("op_1411_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1411_dilations_0 = const()[name = tensor<string, []>("op_1411_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1411_groups_0 = const()[name = tensor<string, []>("op_1411_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60356032))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60880384))), name = tensor<string, []>("layers_3_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(82640640))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(83427136))), name = tensor<string, []>("layers_3_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1411_cast_fp16 = conv(dilations = var_1411_dilations_0, groups = var_1411_groups_0, pad = var_1411_pad_0, pad_type = var_1411_pad_type_0, strides = var_1411_strides_0, weight = layers_3_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_15_cast_fp16)[name = tensor<string, []>("op_1411_cast_fp16")];
             tensor<string, []> var_1417_pad_type_0 = const()[name = tensor<string, []>("op_1417_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1417_strides_0 = const()[name = tensor<string, []>("op_1417_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1417_pad_0 = const()[name = tensor<string, []>("op_1417_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1417_dilations_0 = const()[name = tensor<string, []>("op_1417_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1417_groups_0 = const()[name = tensor<string, []>("op_1417_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60919872))), name = tensor<string, []>("layers_3_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19632]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(60880512))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(83466688))), name = tensor<string, []>("layers_3_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19632]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(83427328))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1417_cast_fp16 = conv(dilations = var_1417_dilations_0, groups = var_1417_groups_0, pad = var_1417_pad_0, pad_type = var_1417_pad_type_0, strides = var_1417_strides_0, weight = layers_3_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_15_cast_fp16)[name = tensor<string, []>("op_1417_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_7_cast_fp16 = add(x = var_1411_cast_fp16, y = var_1417_cast_fp16)[name = tensor<string, []>("key_7_cast_fp16")];
             tensor<string, []> var_1427_pad_type_0 = const()[name = tensor<string, []>("op_1427_pad_type_0"), val = tensor<string, []>("valid")];
@@ -971,33 +971,33 @@ program(1.0)
             tensor<int32, [4]> var_1427_pad_0 = const()[name = tensor<string, []>("op_1427_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1427_dilations_0 = const()[name = tensor<string, []>("op_1427_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1427_groups_0 = const()[name = tensor<string, []>("op_1427_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61051008))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61575360))), name = tensor<string, []>("layers_3_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(83597824))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84384320))), name = tensor<string, []>("layers_3_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1427_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1427_dilations_0, groups = var_1427_groups_0, pad = var_1427_pad_0, pad_type = var_1427_pad_type_0, strides = var_1427_strides_0, weight = layers_3_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_15_cast_fp16)[name = tensor<string, []>("op_1427_cast_fp16")];
             tensor<string, []> var_1433_pad_type_0 = const()[name = tensor<string, []>("op_1433_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1433_strides_0 = const()[name = tensor<string, []>("op_1433_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1433_pad_0 = const()[name = tensor<string, []>("op_1433_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1433_dilations_0 = const()[name = tensor<string, []>("op_1433_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1433_groups_0 = const()[name = tensor<string, []>("op_1433_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61607872))), name = tensor<string, []>("layers_3_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16159]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61575488))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84416896))), name = tensor<string, []>("layers_3_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16159]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84384512))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1433_cast_fp16 = conv(dilations = var_1433_dilations_0, groups = var_1433_groups_0, pad = var_1433_pad_0, pad_type = var_1433_pad_type_0, strides = var_1433_strides_0, weight = layers_3_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_15_cast_fp16)[name = tensor<string, []>("op_1433_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_7_cast_fp16 = add(x = var_1427_cast_fp16, y = var_1433_cast_fp16)[name = tensor<string, []>("value_7_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_1436_to_fp16 = const()[name = tensor<string, []>("op_1436_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61739008)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1436_to_fp16 = const()[name = tensor<string, []>("op_1436_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84548032)))];
             tensor<fp16, [1, 1024, 1, 188]> query_15_cast_fp16 = add(x = query_13_cast_fp16, y = var_1436_to_fp16)[name = tensor<string, []>("query_15_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_1439_to_fp16 = const()[name = tensor<string, []>("op_1439_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61741120)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1439_to_fp16 = const()[name = tensor<string, []>("op_1439_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84550144)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_7_cast_fp16 = add(x = query_13_cast_fp16, y = var_1439_to_fp16)[name = tensor<string, []>("q_with_bias_v_7_cast_fp16")];
             tensor<string, []> var_1449_pad_type_0 = const()[name = tensor<string, []>("op_1449_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1449_strides_0 = const()[name = tensor<string, []>("op_1449_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1449_pad_0 = const()[name = tensor<string, []>("op_1449_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1449_dilations_0 = const()[name = tensor<string, []>("op_1449_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1449_groups_0 = const()[name = tensor<string, []>("op_1449_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(61743232))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(62267584))), name = tensor<string, []>("layers_3_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84552256))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(85338752))), name = tensor<string, []>("layers_3_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_1449_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1449_dilations_0, groups = var_1449_groups_0, pad = var_1449_pad_0, pad_type = var_1449_pad_type_0, strides = var_1449_strides_0, weight = layers_3_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_1449_cast_fp16")];
             tensor<string, []> var_1455_pad_type_0 = const()[name = tensor<string, []>("op_1455_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1455_strides_0 = const()[name = tensor<string, []>("op_1455_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1455_pad_0 = const()[name = tensor<string, []>("op_1455_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1455_dilations_0 = const()[name = tensor<string, []>("op_1455_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1455_groups_0 = const()[name = tensor<string, []>("op_1455_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(62346048))), name = tensor<string, []>("layers_3_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [39125]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(62267712))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(85417280))), name = tensor<string, []>("layers_3_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [39125]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(85338944))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_1455_cast_fp16 = conv(dilations = var_1455_dilations_0, groups = var_1455_groups_0, pad = var_1455_pad_0, pad_type = var_1455_pad_type_0, strides = var_1455_strides_0, weight = layers_3_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_1455_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_7_cast_fp16 = add(x = var_1449_cast_fp16, y = var_1455_cast_fp16)[name = tensor<string, []>("p_7_cast_fp16")];
             tensor<int32, [4]> var_1459 = const()[name = tensor<string, []>("op_1459"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -1048,22 +1048,22 @@ program(1.0)
             tensor<int32, [4]> var_1512_pad_0 = const()[name = tensor<string, []>("op_1512_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1512_dilations_0 = const()[name = tensor<string, []>("op_1512_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1512_groups_0 = const()[name = tensor<string, []>("op_1512_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(62477184))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63001536))), name = tensor<string, []>("layers_3_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(85548416))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(86334912))), name = tensor<string, []>("layers_3_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1512_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1512_dilations_0, groups = var_1512_groups_0, pad = var_1512_pad_0, pad_type = var_1512_pad_type_0, strides = var_1512_strides_0, weight = layers_3_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_101_cast_fp16)[name = tensor<string, []>("op_1512_cast_fp16")];
             tensor<string, []> var_1518_pad_type_0 = const()[name = tensor<string, []>("op_1518_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1518_strides_0 = const()[name = tensor<string, []>("op_1518_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1518_pad_0 = const()[name = tensor<string, []>("op_1518_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1518_dilations_0 = const()[name = tensor<string, []>("op_1518_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1518_groups_0 = const()[name = tensor<string, []>("op_1518_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63035072))), name = tensor<string, []>("layers_3_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16652]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63001664))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(86368512))), name = tensor<string, []>("layers_3_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16652]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(86335104))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1518_cast_fp16 = conv(dilations = var_1518_dilations_0, groups = var_1518_groups_0, pad = var_1518_pad_0, pad_type = var_1518_pad_type_0, strides = var_1518_strides_0, weight = layers_3_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_101_cast_fp16)[name = tensor<string, []>("op_1518_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_17_cast_fp16 = add(x = var_1512_cast_fp16, y = var_1518_cast_fp16)[name = tensor<string, []>("obj_17_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_35_cast_fp16 = add(x = inputs_33_cast_fp16, y = obj_17_cast_fp16)[name = tensor<string, []>("inputs_35_cast_fp16")];
             tensor<int32, [1]> out_35_axes_0 = const()[name = tensor<string, []>("out_35_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1529_to_fp16 = const()[name = tensor<string, []>("op_1529_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_35_cast_fp16 = layer_norm(axes = out_35_axes_0, epsilon = var_1529_to_fp16, x = inputs_35_cast_fp16)[name = tensor<string, []>("out_35_cast_fp16")];
-            tensor<fp16, [1024]> input_103_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_103_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63166208)))];
-            tensor<fp16, [1024]> input_103_beta_0_to_fp16 = const()[name = tensor<string, []>("input_103_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63168320)))];
+            tensor<fp16, [1024]> input_103_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_103_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(86499648)))];
+            tensor<fp16, [1024]> input_103_beta_0_to_fp16 = const()[name = tensor<string, []>("input_103_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(86501760)))];
             tensor<fp16, []> input_103_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_103_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_103_cast_fp16 = batch_norm(beta = input_103_beta_0_to_fp16, epsilon = input_103_epsilon_0_to_fp16, gamma = input_103_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_35_cast_fp16)[name = tensor<string, []>("input_103_cast_fp16")];
             tensor<string, []> var_1550_pad_type_0 = const()[name = tensor<string, []>("op_1550_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1071,14 +1071,14 @@ program(1.0)
             tensor<int32, [4]> var_1550_pad_0 = const()[name = tensor<string, []>("op_1550_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1550_dilations_0 = const()[name = tensor<string, []>("op_1550_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1550_groups_0 = const()[name = tensor<string, []>("op_1550_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_3_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(63170432))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64219072))), name = tensor<string, []>("layers_3_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_3_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(86503872))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(88076800))), name = tensor<string, []>("layers_3_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_1550_cast_fp16 = conv(dilations = var_1550_dilations_0, groups = var_1550_groups_0, pad = var_1550_pad_0, pad_type = var_1550_pad_type_0, strides = var_1550_strides_0, weight = layers_3_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_103_cast_fp16)[name = tensor<string, []>("op_1550_cast_fp16")];
             tensor<string, []> var_1556_pad_type_0 = const()[name = tensor<string, []>("op_1556_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1556_strides_0 = const()[name = tensor<string, []>("op_1556_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1556_pad_0 = const()[name = tensor<string, []>("op_1556_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1556_dilations_0 = const()[name = tensor<string, []>("op_1556_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1556_groups_0 = const()[name = tensor<string, []>("op_1556_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_3_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64284672))), name = tensor<string, []>("layers_3_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32686]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64219200))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_3_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(88142464))), name = tensor<string, []>("layers_3_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32686]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(88076992))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_1556_cast_fp16 = conv(dilations = var_1556_dilations_0, groups = var_1556_groups_0, pad = var_1556_pad_0, pad_type = var_1556_pad_type_0, strides = var_1556_strides_0, weight = layers_3_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_103_cast_fp16)[name = tensor<string, []>("op_1556_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_105_cast_fp16 = add(x = var_1550_cast_fp16, y = var_1556_cast_fp16)[name = tensor<string, []>("input_105_cast_fp16")];
             tensor<int32, []> input_107_split_num_splits_0 = const()[name = tensor<string, []>("input_107_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -1091,8 +1091,8 @@ program(1.0)
             tensor<int32, []> input_109_groups_0 = const()[name = tensor<string, []>("input_109_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_109_strides_0 = const()[name = tensor<string, []>("input_109_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_109_dilations_0 = const()[name = tensor<string, []>("input_109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_274_to_fp16 = const()[name = tensor<string, []>("const_274_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64546880)))];
-            tensor<fp16, [1024]> const_275_to_fp16 = const()[name = tensor<string, []>("const_275_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64565376)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_274_to_fp16 = const()[name = tensor<string, []>("const_274_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(88404672)))];
+            tensor<fp16, [1024]> const_275_to_fp16 = const()[name = tensor<string, []>("const_275_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(88423168)))];
             tensor<fp16, [1, 1024, 1, 188]> input_111_cast_fp16 = conv(bias = const_275_to_fp16, dilations = input_109_dilations_0, groups = input_109_groups_0, pad = input_109_pad_0, pad_type = input_109_pad_type_0, strides = input_109_strides_0, weight = const_274_to_fp16, x = input_107_cast_fp16)[name = tensor<string, []>("input_111_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_113_cast_fp16 = silu(x = input_111_cast_fp16)[name = tensor<string, []>("input_113_cast_fp16")];
             tensor<string, []> var_1578_pad_type_0 = const()[name = tensor<string, []>("op_1578_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1100,22 +1100,22 @@ program(1.0)
             tensor<int32, [4]> var_1578_pad_0 = const()[name = tensor<string, []>("op_1578_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1578_dilations_0 = const()[name = tensor<string, []>("op_1578_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1578_groups_0 = const()[name = tensor<string, []>("op_1578_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64567488))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65091840))), name = tensor<string, []>("layers_3_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(88425280))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(89211776))), name = tensor<string, []>("layers_3_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1578_cast_fp16 = conv(dilations = var_1578_dilations_0, groups = var_1578_groups_0, pad = var_1578_pad_0, pad_type = var_1578_pad_type_0, strides = var_1578_strides_0, weight = layers_3_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_113_cast_fp16)[name = tensor<string, []>("op_1578_cast_fp16")];
             tensor<string, []> var_1584_pad_type_0 = const()[name = tensor<string, []>("op_1584_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1584_strides_0 = const()[name = tensor<string, []>("op_1584_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1584_pad_0 = const()[name = tensor<string, []>("op_1584_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1584_dilations_0 = const()[name = tensor<string, []>("op_1584_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1584_groups_0 = const()[name = tensor<string, []>("op_1584_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_3_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65125696))), name = tensor<string, []>("layers_3_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16815]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65091968))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_3_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(89245696))), name = tensor<string, []>("layers_3_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16815]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(89211968))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1584_cast_fp16 = conv(dilations = var_1584_dilations_0, groups = var_1584_groups_0, pad = var_1584_pad_0, pad_type = var_1584_pad_type_0, strides = var_1584_strides_0, weight = layers_3_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_113_cast_fp16)[name = tensor<string, []>("op_1584_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_23_cast_fp16 = add(x = var_1578_cast_fp16, y = var_1584_cast_fp16)[name = tensor<string, []>("x_23_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_37_cast_fp16 = add(x = inputs_35_cast_fp16, y = x_23_cast_fp16)[name = tensor<string, []>("inputs_37_cast_fp16")];
             tensor<int32, [1]> out_37_axes_0 = const()[name = tensor<string, []>("out_37_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1595_to_fp16 = const()[name = tensor<string, []>("op_1595_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_37_cast_fp16 = layer_norm(axes = out_37_axes_0, epsilon = var_1595_to_fp16, x = inputs_37_cast_fp16)[name = tensor<string, []>("out_37_cast_fp16")];
-            tensor<fp16, [1024]> input_115_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_115_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65256832)))];
-            tensor<fp16, [1024]> input_115_beta_0_to_fp16 = const()[name = tensor<string, []>("input_115_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65258944)))];
+            tensor<fp16, [1024]> input_115_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_115_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(89376832)))];
+            tensor<fp16, [1024]> input_115_beta_0_to_fp16 = const()[name = tensor<string, []>("input_115_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(89378944)))];
             tensor<fp16, []> input_115_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_115_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_115_cast_fp16 = batch_norm(beta = input_115_beta_0_to_fp16, epsilon = input_115_epsilon_0_to_fp16, gamma = input_115_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_37_cast_fp16)[name = tensor<string, []>("input_115_cast_fp16")];
             tensor<string, []> var_1615_pad_type_0 = const()[name = tensor<string, []>("op_1615_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1123,14 +1123,14 @@ program(1.0)
             tensor<int32, [4]> var_1615_pad_0 = const()[name = tensor<string, []>("op_1615_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1615_dilations_0 = const()[name = tensor<string, []>("op_1615_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1615_groups_0 = const()[name = tensor<string, []>("op_1615_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_3_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(65261056))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(67358272))), name = tensor<string, []>("layers_3_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_3_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(89381056))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(92526848))), name = tensor<string, []>("layers_3_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_1615_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_1615_dilations_0, groups = var_1615_groups_0, pad = var_1615_pad_0, pad_type = var_1615_pad_type_0, strides = var_1615_strides_0, weight = layers_3_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_115_cast_fp16)[name = tensor<string, []>("op_1615_cast_fp16")];
             tensor<string, []> var_1621_pad_type_0 = const()[name = tensor<string, []>("op_1621_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1621_strides_0 = const()[name = tensor<string, []>("op_1621_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1621_pad_0 = const()[name = tensor<string, []>("op_1621_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1621_dilations_0 = const()[name = tensor<string, []>("op_1621_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1621_groups_0 = const()[name = tensor<string, []>("op_1621_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_3_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(67522048))), name = tensor<string, []>("layers_3_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81787]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(67358400))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_3_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(92690688))), name = tensor<string, []>("layers_3_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81787]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(92527040))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_1621_cast_fp16 = conv(dilations = var_1621_dilations_0, groups = var_1621_groups_0, pad = var_1621_pad_0, pad_type = var_1621_pad_type_0, strides = var_1621_strides_0, weight = layers_3_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_115_cast_fp16)[name = tensor<string, []>("op_1621_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_117_cast_fp16 = add(x = var_1615_cast_fp16, y = var_1621_cast_fp16)[name = tensor<string, []>("input_117_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_119_cast_fp16 = silu(x = input_117_cast_fp16)[name = tensor<string, []>("input_119_cast_fp16")];
@@ -1139,14 +1139,14 @@ program(1.0)
             tensor<int32, [4]> var_1632_pad_0 = const()[name = tensor<string, []>("op_1632_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1632_dilations_0 = const()[name = tensor<string, []>("op_1632_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1632_groups_0 = const()[name = tensor<string, []>("op_1632_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_3_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(68046400))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(70143616))), name = tensor<string, []>("layers_3_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_3_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(93215040))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96360832))), name = tensor<string, []>("layers_3_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1632_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1632_dilations_0, groups = var_1632_groups_0, pad = var_1632_pad_0, pad_type = var_1632_pad_type_0, strides = var_1632_strides_0, weight = layers_3_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_119_cast_fp16)[name = tensor<string, []>("op_1632_cast_fp16")];
             tensor<string, []> var_1638_pad_type_0 = const()[name = tensor<string, []>("op_1638_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1638_strides_0 = const()[name = tensor<string, []>("op_1638_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1638_pad_0 = const()[name = tensor<string, []>("op_1638_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1638_dilations_0 = const()[name = tensor<string, []>("op_1638_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1638_groups_0 = const()[name = tensor<string, []>("op_1638_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_3_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(70331584))), name = tensor<string, []>("layers_3_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [93877]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(70143744))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_3_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96548864))), name = tensor<string, []>("layers_3_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [93877]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96361024))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1638_cast_fp16 = conv(dilations = var_1638_dilations_0, groups = var_1638_groups_0, pad = var_1638_pad_0, pad_type = var_1638_pad_type_0, strides = var_1638_strides_0, weight = layers_3_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_119_cast_fp16)[name = tensor<string, []>("op_1638_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_25_cast_fp16 = add(x = var_1632_cast_fp16, y = var_1638_cast_fp16)[name = tensor<string, []>("x_25_cast_fp16")];
             tensor<fp16, []> var_1640_to_fp16 = const()[name = tensor<string, []>("op_1640_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -1155,16 +1155,16 @@ program(1.0)
             tensor<int32, [1]> out_39_axes_0 = const()[name = tensor<string, []>("out_39_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1651_to_fp16 = const()[name = tensor<string, []>("op_1651_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_39_cast_fp16 = layer_norm(axes = out_39_axes_0, epsilon = var_1651_to_fp16, x = inputs_39_cast_fp16)[name = tensor<string, []>("out_39_cast_fp16")];
-            tensor<fp16, [1024]> inputs_41_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_41_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(70855936)))];
-            tensor<fp16, [1024]> inputs_41_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_41_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(70858048)))];
+            tensor<fp16, [1024]> inputs_41_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_41_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(97073216)))];
+            tensor<fp16, [1024]> inputs_41_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_41_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(97075328)))];
             tensor<fp16, []> inputs_41_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_41_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_41_cast_fp16 = batch_norm(beta = inputs_41_beta_0_to_fp16, epsilon = inputs_41_epsilon_0_to_fp16, gamma = inputs_41_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_39_cast_fp16)[name = tensor<string, []>("inputs_41_cast_fp16")];
             tensor<int32, []> var_1665 = const()[name = tensor<string, []>("op_1665"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_41_axes_0 = const()[name = tensor<string, []>("out_41_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1696_to_fp16 = const()[name = tensor<string, []>("op_1696_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_41_cast_fp16 = layer_norm(axes = out_41_axes_0, epsilon = var_1696_to_fp16, x = inputs_41_cast_fp16)[name = tensor<string, []>("out_41_cast_fp16")];
-            tensor<fp16, [1024]> input_121_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_121_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(70860160)))];
-            tensor<fp16, [1024]> input_121_beta_0_to_fp16 = const()[name = tensor<string, []>("input_121_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(70862272)))];
+            tensor<fp16, [1024]> input_121_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_121_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(97077440)))];
+            tensor<fp16, [1024]> input_121_beta_0_to_fp16 = const()[name = tensor<string, []>("input_121_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(97079552)))];
             tensor<fp16, []> input_121_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_121_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_121_cast_fp16 = batch_norm(beta = input_121_beta_0_to_fp16, epsilon = input_121_epsilon_0_to_fp16, gamma = input_121_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_41_cast_fp16)[name = tensor<string, []>("input_121_cast_fp16")];
             tensor<string, []> var_1716_pad_type_0 = const()[name = tensor<string, []>("op_1716_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1172,14 +1172,14 @@ program(1.0)
             tensor<int32, [4]> var_1716_pad_0 = const()[name = tensor<string, []>("op_1716_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1716_dilations_0 = const()[name = tensor<string, []>("op_1716_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1716_groups_0 = const()[name = tensor<string, []>("op_1716_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_4_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(70864384))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(72961600))), name = tensor<string, []>("layers_4_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_4_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(97081664))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100227456))), name = tensor<string, []>("layers_4_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_1716_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_1716_dilations_0, groups = var_1716_groups_0, pad = var_1716_pad_0, pad_type = var_1716_pad_type_0, strides = var_1716_strides_0, weight = layers_4_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_121_cast_fp16)[name = tensor<string, []>("op_1716_cast_fp16")];
             tensor<string, []> var_1722_pad_type_0 = const()[name = tensor<string, []>("op_1722_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1722_strides_0 = const()[name = tensor<string, []>("op_1722_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1722_pad_0 = const()[name = tensor<string, []>("op_1722_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1722_dilations_0 = const()[name = tensor<string, []>("op_1722_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1722_groups_0 = const()[name = tensor<string, []>("op_1722_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_4_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(73108160))), name = tensor<string, []>("layers_4_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [73182]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(72961728))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_4_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100374080))), name = tensor<string, []>("layers_4_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [73182]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100227648))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_1722_cast_fp16 = conv(dilations = var_1722_dilations_0, groups = var_1722_groups_0, pad = var_1722_pad_0, pad_type = var_1722_pad_type_0, strides = var_1722_strides_0, weight = layers_4_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_121_cast_fp16)[name = tensor<string, []>("op_1722_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_123_cast_fp16 = add(x = var_1716_cast_fp16, y = var_1722_cast_fp16)[name = tensor<string, []>("input_123_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_125_cast_fp16 = silu(x = input_123_cast_fp16)[name = tensor<string, []>("input_125_cast_fp16")];
@@ -1188,14 +1188,14 @@ program(1.0)
             tensor<int32, [4]> var_1733_pad_0 = const()[name = tensor<string, []>("op_1733_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1733_dilations_0 = const()[name = tensor<string, []>("op_1733_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1733_groups_0 = const()[name = tensor<string, []>("op_1733_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_4_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(73632512))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(75729728))), name = tensor<string, []>("layers_4_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_4_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100898432))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104044224))), name = tensor<string, []>("layers_4_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1733_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1733_dilations_0, groups = var_1733_groups_0, pad = var_1733_pad_0, pad_type = var_1733_pad_type_0, strides = var_1733_strides_0, weight = layers_4_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_125_cast_fp16)[name = tensor<string, []>("op_1733_cast_fp16")];
             tensor<string, []> var_1739_pad_type_0 = const()[name = tensor<string, []>("op_1739_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1739_strides_0 = const()[name = tensor<string, []>("op_1739_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1739_pad_0 = const()[name = tensor<string, []>("op_1739_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1739_dilations_0 = const()[name = tensor<string, []>("op_1739_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1739_groups_0 = const()[name = tensor<string, []>("op_1739_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_4_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(75918528))), name = tensor<string, []>("layers_4_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [94276]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(75729856))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_4_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104233088))), name = tensor<string, []>("layers_4_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [94276]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104044416))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1739_cast_fp16 = conv(dilations = var_1739_dilations_0, groups = var_1739_groups_0, pad = var_1739_pad_0, pad_type = var_1739_pad_type_0, strides = var_1739_strides_0, weight = layers_4_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_125_cast_fp16)[name = tensor<string, []>("op_1739_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_27_cast_fp16 = add(x = var_1733_cast_fp16, y = var_1739_cast_fp16)[name = tensor<string, []>("x_27_cast_fp16")];
             tensor<fp16, []> var_1741_to_fp16 = const()[name = tensor<string, []>("op_1741_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -1204,8 +1204,8 @@ program(1.0)
             tensor<int32, [1]> out_43_axes_0 = const()[name = tensor<string, []>("out_43_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1752_to_fp16 = const()[name = tensor<string, []>("op_1752_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_43_cast_fp16 = layer_norm(axes = out_43_axes_0, epsilon = var_1752_to_fp16, x = inputs_43_cast_fp16)[name = tensor<string, []>("out_43_cast_fp16")];
-            tensor<fp16, [1024]> obj_19_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_19_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(76442880)))];
-            tensor<fp16, [1024]> obj_19_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_19_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(76444992)))];
+            tensor<fp16, [1024]> obj_19_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_19_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104757440)))];
+            tensor<fp16, [1024]> obj_19_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_19_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104759552)))];
             tensor<fp16, []> obj_19_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_19_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_19_cast_fp16 = batch_norm(beta = obj_19_beta_0_to_fp16, epsilon = obj_19_epsilon_0_to_fp16, gamma = obj_19_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_43_cast_fp16)[name = tensor<string, []>("obj_19_cast_fp16")];
             tensor<string, []> var_1777_pad_type_0 = const()[name = tensor<string, []>("op_1777_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1213,14 +1213,14 @@ program(1.0)
             tensor<int32, [4]> var_1777_pad_0 = const()[name = tensor<string, []>("op_1777_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1777_dilations_0 = const()[name = tensor<string, []>("op_1777_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1777_groups_0 = const()[name = tensor<string, []>("op_1777_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(76447104))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(76971456))), name = tensor<string, []>("layers_4_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104761664))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(105548160))), name = tensor<string, []>("layers_4_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1777_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1777_dilations_0, groups = var_1777_groups_0, pad = var_1777_pad_0, pad_type = var_1777_pad_type_0, strides = var_1777_strides_0, weight = layers_4_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_19_cast_fp16)[name = tensor<string, []>("op_1777_cast_fp16")];
             tensor<string, []> var_1783_pad_type_0 = const()[name = tensor<string, []>("op_1783_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1783_strides_0 = const()[name = tensor<string, []>("op_1783_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1783_pad_0 = const()[name = tensor<string, []>("op_1783_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1783_dilations_0 = const()[name = tensor<string, []>("op_1783_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1783_groups_0 = const()[name = tensor<string, []>("op_1783_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77007936))), name = tensor<string, []>("layers_4_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18138]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(76971584))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(105584704))), name = tensor<string, []>("layers_4_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18138]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(105548352))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1783_cast_fp16 = conv(dilations = var_1783_dilations_0, groups = var_1783_groups_0, pad = var_1783_pad_0, pad_type = var_1783_pad_type_0, strides = var_1783_strides_0, weight = layers_4_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_19_cast_fp16)[name = tensor<string, []>("op_1783_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_17_cast_fp16 = add(x = var_1777_cast_fp16, y = var_1783_cast_fp16)[name = tensor<string, []>("query_17_cast_fp16")];
             tensor<string, []> var_1792_pad_type_0 = const()[name = tensor<string, []>("op_1792_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1228,14 +1228,14 @@ program(1.0)
             tensor<int32, [4]> var_1792_pad_0 = const()[name = tensor<string, []>("op_1792_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1792_dilations_0 = const()[name = tensor<string, []>("op_1792_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1792_groups_0 = const()[name = tensor<string, []>("op_1792_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77139072))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77663424))), name = tensor<string, []>("layers_4_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(105715840))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(106502336))), name = tensor<string, []>("layers_4_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1792_cast_fp16 = conv(dilations = var_1792_dilations_0, groups = var_1792_groups_0, pad = var_1792_pad_0, pad_type = var_1792_pad_type_0, strides = var_1792_strides_0, weight = layers_4_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_19_cast_fp16)[name = tensor<string, []>("op_1792_cast_fp16")];
             tensor<string, []> var_1798_pad_type_0 = const()[name = tensor<string, []>("op_1798_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1798_strides_0 = const()[name = tensor<string, []>("op_1798_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1798_pad_0 = const()[name = tensor<string, []>("op_1798_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1798_dilations_0 = const()[name = tensor<string, []>("op_1798_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1798_groups_0 = const()[name = tensor<string, []>("op_1798_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77699776))), name = tensor<string, []>("layers_4_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18070]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77663552))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(106538752))), name = tensor<string, []>("layers_4_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18070]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(106502528))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1798_cast_fp16 = conv(dilations = var_1798_dilations_0, groups = var_1798_groups_0, pad = var_1798_pad_0, pad_type = var_1798_pad_type_0, strides = var_1798_strides_0, weight = layers_4_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_19_cast_fp16)[name = tensor<string, []>("op_1798_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_9_cast_fp16 = add(x = var_1792_cast_fp16, y = var_1798_cast_fp16)[name = tensor<string, []>("key_9_cast_fp16")];
             tensor<string, []> var_1808_pad_type_0 = const()[name = tensor<string, []>("op_1808_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1243,33 +1243,33 @@ program(1.0)
             tensor<int32, [4]> var_1808_pad_0 = const()[name = tensor<string, []>("op_1808_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1808_dilations_0 = const()[name = tensor<string, []>("op_1808_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1808_groups_0 = const()[name = tensor<string, []>("op_1808_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(77830912))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(78355264))), name = tensor<string, []>("layers_4_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(106669888))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(107456384))), name = tensor<string, []>("layers_4_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1808_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1808_dilations_0, groups = var_1808_groups_0, pad = var_1808_pad_0, pad_type = var_1808_pad_type_0, strides = var_1808_strides_0, weight = layers_4_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_19_cast_fp16)[name = tensor<string, []>("op_1808_cast_fp16")];
             tensor<string, []> var_1814_pad_type_0 = const()[name = tensor<string, []>("op_1814_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1814_strides_0 = const()[name = tensor<string, []>("op_1814_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1814_pad_0 = const()[name = tensor<string, []>("op_1814_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1814_dilations_0 = const()[name = tensor<string, []>("op_1814_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1814_groups_0 = const()[name = tensor<string, []>("op_1814_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(78388800))), name = tensor<string, []>("layers_4_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16664]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(78355392))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(107489984))), name = tensor<string, []>("layers_4_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16664]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(107456576))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1814_cast_fp16 = conv(dilations = var_1814_dilations_0, groups = var_1814_groups_0, pad = var_1814_pad_0, pad_type = var_1814_pad_type_0, strides = var_1814_strides_0, weight = layers_4_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_19_cast_fp16)[name = tensor<string, []>("op_1814_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_9_cast_fp16 = add(x = var_1808_cast_fp16, y = var_1814_cast_fp16)[name = tensor<string, []>("value_9_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_1817_to_fp16 = const()[name = tensor<string, []>("op_1817_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(78519936)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1817_to_fp16 = const()[name = tensor<string, []>("op_1817_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(107621120)))];
             tensor<fp16, [1, 1024, 1, 188]> query_19_cast_fp16 = add(x = query_17_cast_fp16, y = var_1817_to_fp16)[name = tensor<string, []>("query_19_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_1820_to_fp16 = const()[name = tensor<string, []>("op_1820_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(78522048)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_1820_to_fp16 = const()[name = tensor<string, []>("op_1820_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(107623232)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_9_cast_fp16 = add(x = query_17_cast_fp16, y = var_1820_to_fp16)[name = tensor<string, []>("q_with_bias_v_9_cast_fp16")];
             tensor<string, []> var_1830_pad_type_0 = const()[name = tensor<string, []>("op_1830_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1830_strides_0 = const()[name = tensor<string, []>("op_1830_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1830_pad_0 = const()[name = tensor<string, []>("op_1830_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1830_dilations_0 = const()[name = tensor<string, []>("op_1830_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1830_groups_0 = const()[name = tensor<string, []>("op_1830_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(78524160))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79048512))), name = tensor<string, []>("layers_4_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(107625344))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(108411840))), name = tensor<string, []>("layers_4_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_1830_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1830_dilations_0, groups = var_1830_groups_0, pad = var_1830_pad_0, pad_type = var_1830_pad_type_0, strides = var_1830_strides_0, weight = layers_4_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_1830_cast_fp16")];
             tensor<string, []> var_1836_pad_type_0 = const()[name = tensor<string, []>("op_1836_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1836_strides_0 = const()[name = tensor<string, []>("op_1836_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1836_pad_0 = const()[name = tensor<string, []>("op_1836_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1836_dilations_0 = const()[name = tensor<string, []>("op_1836_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1836_groups_0 = const()[name = tensor<string, []>("op_1836_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79125888))), name = tensor<string, []>("layers_4_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [38591]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79048640))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(108489280))), name = tensor<string, []>("layers_4_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [38591]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(108412032))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_1836_cast_fp16 = conv(dilations = var_1836_dilations_0, groups = var_1836_groups_0, pad = var_1836_pad_0, pad_type = var_1836_pad_type_0, strides = var_1836_strides_0, weight = layers_4_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_1836_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_9_cast_fp16 = add(x = var_1830_cast_fp16, y = var_1836_cast_fp16)[name = tensor<string, []>("p_9_cast_fp16")];
             tensor<int32, [4]> var_1840 = const()[name = tensor<string, []>("op_1840"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -1320,22 +1320,22 @@ program(1.0)
             tensor<int32, [4]> var_1893_pad_0 = const()[name = tensor<string, []>("op_1893_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1893_dilations_0 = const()[name = tensor<string, []>("op_1893_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1893_groups_0 = const()[name = tensor<string, []>("op_1893_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79257024))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79781376))), name = tensor<string, []>("layers_4_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(108620416))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109406912))), name = tensor<string, []>("layers_4_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1893_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_1893_dilations_0, groups = var_1893_groups_0, pad = var_1893_pad_0, pad_type = var_1893_pad_type_0, strides = var_1893_strides_0, weight = layers_4_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_127_cast_fp16)[name = tensor<string, []>("op_1893_cast_fp16")];
             tensor<string, []> var_1899_pad_type_0 = const()[name = tensor<string, []>("op_1899_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1899_strides_0 = const()[name = tensor<string, []>("op_1899_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1899_pad_0 = const()[name = tensor<string, []>("op_1899_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1899_dilations_0 = const()[name = tensor<string, []>("op_1899_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1899_groups_0 = const()[name = tensor<string, []>("op_1899_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79813056))), name = tensor<string, []>("layers_4_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15738]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79781504))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109438656))), name = tensor<string, []>("layers_4_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15738]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109407104))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1899_cast_fp16 = conv(dilations = var_1899_dilations_0, groups = var_1899_groups_0, pad = var_1899_pad_0, pad_type = var_1899_pad_type_0, strides = var_1899_strides_0, weight = layers_4_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_127_cast_fp16)[name = tensor<string, []>("op_1899_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_21_cast_fp16 = add(x = var_1893_cast_fp16, y = var_1899_cast_fp16)[name = tensor<string, []>("obj_21_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_45_cast_fp16 = add(x = inputs_43_cast_fp16, y = obj_21_cast_fp16)[name = tensor<string, []>("inputs_45_cast_fp16")];
             tensor<int32, [1]> out_45_axes_0 = const()[name = tensor<string, []>("out_45_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1910_to_fp16 = const()[name = tensor<string, []>("op_1910_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_45_cast_fp16 = layer_norm(axes = out_45_axes_0, epsilon = var_1910_to_fp16, x = inputs_45_cast_fp16)[name = tensor<string, []>("out_45_cast_fp16")];
-            tensor<fp16, [1024]> input_129_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_129_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79944192)))];
-            tensor<fp16, [1024]> input_129_beta_0_to_fp16 = const()[name = tensor<string, []>("input_129_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79946304)))];
+            tensor<fp16, [1024]> input_129_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_129_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109569792)))];
+            tensor<fp16, [1024]> input_129_beta_0_to_fp16 = const()[name = tensor<string, []>("input_129_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109571904)))];
             tensor<fp16, []> input_129_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_129_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_129_cast_fp16 = batch_norm(beta = input_129_beta_0_to_fp16, epsilon = input_129_epsilon_0_to_fp16, gamma = input_129_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_45_cast_fp16)[name = tensor<string, []>("input_129_cast_fp16")];
             tensor<string, []> var_1931_pad_type_0 = const()[name = tensor<string, []>("op_1931_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1343,14 +1343,14 @@ program(1.0)
             tensor<int32, [4]> var_1931_pad_0 = const()[name = tensor<string, []>("op_1931_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1931_dilations_0 = const()[name = tensor<string, []>("op_1931_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1931_groups_0 = const()[name = tensor<string, []>("op_1931_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_4_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(79948416))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(80997056))), name = tensor<string, []>("layers_4_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_4_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109574016))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111146944))), name = tensor<string, []>("layers_4_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_1931_cast_fp16 = conv(dilations = var_1931_dilations_0, groups = var_1931_groups_0, pad = var_1931_pad_0, pad_type = var_1931_pad_type_0, strides = var_1931_strides_0, weight = layers_4_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_129_cast_fp16)[name = tensor<string, []>("op_1931_cast_fp16")];
             tensor<string, []> var_1937_pad_type_0 = const()[name = tensor<string, []>("op_1937_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1937_strides_0 = const()[name = tensor<string, []>("op_1937_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1937_pad_0 = const()[name = tensor<string, []>("op_1937_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1937_dilations_0 = const()[name = tensor<string, []>("op_1937_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1937_groups_0 = const()[name = tensor<string, []>("op_1937_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_4_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81062400))), name = tensor<string, []>("layers_4_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32564]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(80997184))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_4_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111212352))), name = tensor<string, []>("layers_4_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32564]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111147136))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_1937_cast_fp16 = conv(dilations = var_1937_dilations_0, groups = var_1937_groups_0, pad = var_1937_pad_0, pad_type = var_1937_pad_type_0, strides = var_1937_strides_0, weight = layers_4_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_129_cast_fp16)[name = tensor<string, []>("op_1937_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_131_cast_fp16 = add(x = var_1931_cast_fp16, y = var_1937_cast_fp16)[name = tensor<string, []>("input_131_cast_fp16")];
             tensor<int32, []> input_133_split_num_splits_0 = const()[name = tensor<string, []>("input_133_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -1363,8 +1363,8 @@ program(1.0)
             tensor<int32, []> input_135_groups_0 = const()[name = tensor<string, []>("input_135_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_135_strides_0 = const()[name = tensor<string, []>("input_135_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_135_dilations_0 = const()[name = tensor<string, []>("input_135_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_276_to_fp16 = const()[name = tensor<string, []>("const_276_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81324608)))];
-            tensor<fp16, [1024]> const_277_to_fp16 = const()[name = tensor<string, []>("const_277_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81343104)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_276_to_fp16 = const()[name = tensor<string, []>("const_276_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111474560)))];
+            tensor<fp16, [1024]> const_277_to_fp16 = const()[name = tensor<string, []>("const_277_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111493056)))];
             tensor<fp16, [1, 1024, 1, 188]> input_137_cast_fp16 = conv(bias = const_277_to_fp16, dilations = input_135_dilations_0, groups = input_135_groups_0, pad = input_135_pad_0, pad_type = input_135_pad_type_0, strides = input_135_strides_0, weight = const_276_to_fp16, x = input_133_cast_fp16)[name = tensor<string, []>("input_137_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_139_cast_fp16 = silu(x = input_137_cast_fp16)[name = tensor<string, []>("input_139_cast_fp16")];
             tensor<string, []> var_1959_pad_type_0 = const()[name = tensor<string, []>("op_1959_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1372,22 +1372,22 @@ program(1.0)
             tensor<int32, [4]> var_1959_pad_0 = const()[name = tensor<string, []>("op_1959_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1959_dilations_0 = const()[name = tensor<string, []>("op_1959_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1959_groups_0 = const()[name = tensor<string, []>("op_1959_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81345216))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81869568))), name = tensor<string, []>("layers_4_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111495168))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112281664))), name = tensor<string, []>("layers_4_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1959_cast_fp16 = conv(dilations = var_1959_dilations_0, groups = var_1959_groups_0, pad = var_1959_pad_0, pad_type = var_1959_pad_type_0, strides = var_1959_strides_0, weight = layers_4_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_139_cast_fp16)[name = tensor<string, []>("op_1959_cast_fp16")];
             tensor<string, []> var_1965_pad_type_0 = const()[name = tensor<string, []>("op_1965_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_1965_strides_0 = const()[name = tensor<string, []>("op_1965_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_1965_pad_0 = const()[name = tensor<string, []>("op_1965_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1965_dilations_0 = const()[name = tensor<string, []>("op_1965_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1965_groups_0 = const()[name = tensor<string, []>("op_1965_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_4_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81902464))), name = tensor<string, []>("layers_4_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16338]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(81869696))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_4_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112314624))), name = tensor<string, []>("layers_4_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16338]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112281856))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_1965_cast_fp16 = conv(dilations = var_1965_dilations_0, groups = var_1965_groups_0, pad = var_1965_pad_0, pad_type = var_1965_pad_type_0, strides = var_1965_strides_0, weight = layers_4_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_139_cast_fp16)[name = tensor<string, []>("op_1965_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_29_cast_fp16 = add(x = var_1959_cast_fp16, y = var_1965_cast_fp16)[name = tensor<string, []>("x_29_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_47_cast_fp16 = add(x = inputs_45_cast_fp16, y = x_29_cast_fp16)[name = tensor<string, []>("inputs_47_cast_fp16")];
             tensor<int32, [1]> out_47_axes_0 = const()[name = tensor<string, []>("out_47_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_1976_to_fp16 = const()[name = tensor<string, []>("op_1976_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_47_cast_fp16 = layer_norm(axes = out_47_axes_0, epsilon = var_1976_to_fp16, x = inputs_47_cast_fp16)[name = tensor<string, []>("out_47_cast_fp16")];
-            tensor<fp16, [1024]> input_141_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_141_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(82033600)))];
-            tensor<fp16, [1024]> input_141_beta_0_to_fp16 = const()[name = tensor<string, []>("input_141_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(82035712)))];
+            tensor<fp16, [1024]> input_141_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_141_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112445760)))];
+            tensor<fp16, [1024]> input_141_beta_0_to_fp16 = const()[name = tensor<string, []>("input_141_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112447872)))];
             tensor<fp16, []> input_141_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_141_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_141_cast_fp16 = batch_norm(beta = input_141_beta_0_to_fp16, epsilon = input_141_epsilon_0_to_fp16, gamma = input_141_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_47_cast_fp16)[name = tensor<string, []>("input_141_cast_fp16")];
             tensor<string, []> var_1996_pad_type_0 = const()[name = tensor<string, []>("op_1996_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1395,14 +1395,14 @@ program(1.0)
             tensor<int32, [4]> var_1996_pad_0 = const()[name = tensor<string, []>("op_1996_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_1996_dilations_0 = const()[name = tensor<string, []>("op_1996_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_1996_groups_0 = const()[name = tensor<string, []>("op_1996_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_4_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(82037824))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84135040))), name = tensor<string, []>("layers_4_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_4_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112449984))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(115595776))), name = tensor<string, []>("layers_4_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_1996_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_1996_dilations_0, groups = var_1996_groups_0, pad = var_1996_pad_0, pad_type = var_1996_pad_type_0, strides = var_1996_strides_0, weight = layers_4_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_141_cast_fp16)[name = tensor<string, []>("op_1996_cast_fp16")];
             tensor<string, []> var_2002_pad_type_0 = const()[name = tensor<string, []>("op_2002_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2002_strides_0 = const()[name = tensor<string, []>("op_2002_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2002_pad_0 = const()[name = tensor<string, []>("op_2002_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2002_dilations_0 = const()[name = tensor<string, []>("op_2002_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2002_groups_0 = const()[name = tensor<string, []>("op_2002_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_4_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84301184))), name = tensor<string, []>("layers_4_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82964]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84135168))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_4_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(115761984))), name = tensor<string, []>("layers_4_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82964]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(115595968))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2002_cast_fp16 = conv(dilations = var_2002_dilations_0, groups = var_2002_groups_0, pad = var_2002_pad_0, pad_type = var_2002_pad_type_0, strides = var_2002_strides_0, weight = layers_4_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_141_cast_fp16)[name = tensor<string, []>("op_2002_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_143_cast_fp16 = add(x = var_1996_cast_fp16, y = var_2002_cast_fp16)[name = tensor<string, []>("input_143_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_145_cast_fp16 = silu(x = input_143_cast_fp16)[name = tensor<string, []>("input_145_cast_fp16")];
@@ -1411,14 +1411,14 @@ program(1.0)
             tensor<int32, [4]> var_2013_pad_0 = const()[name = tensor<string, []>("op_2013_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2013_dilations_0 = const()[name = tensor<string, []>("op_2013_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2013_groups_0 = const()[name = tensor<string, []>("op_2013_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_4_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(84825536))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(86922752))), name = tensor<string, []>("layers_4_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_4_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(116286336))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(119432128))), name = tensor<string, []>("layers_4_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2013_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2013_dilations_0, groups = var_2013_groups_0, pad = var_2013_pad_0, pad_type = var_2013_pad_type_0, strides = var_2013_strides_0, weight = layers_4_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_145_cast_fp16)[name = tensor<string, []>("op_2013_cast_fp16")];
             tensor<string, []> var_2019_pad_type_0 = const()[name = tensor<string, []>("op_2019_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2019_strides_0 = const()[name = tensor<string, []>("op_2019_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2019_pad_0 = const()[name = tensor<string, []>("op_2019_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2019_dilations_0 = const()[name = tensor<string, []>("op_2019_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2019_groups_0 = const()[name = tensor<string, []>("op_2019_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_4_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(87111424))), name = tensor<string, []>("layers_4_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [94214]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(86922880))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_4_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(119620864))), name = tensor<string, []>("layers_4_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [94214]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(119432320))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2019_cast_fp16 = conv(dilations = var_2019_dilations_0, groups = var_2019_groups_0, pad = var_2019_pad_0, pad_type = var_2019_pad_type_0, strides = var_2019_strides_0, weight = layers_4_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_145_cast_fp16)[name = tensor<string, []>("op_2019_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_31_cast_fp16 = add(x = var_2013_cast_fp16, y = var_2019_cast_fp16)[name = tensor<string, []>("x_31_cast_fp16")];
             tensor<fp16, []> var_2021_to_fp16 = const()[name = tensor<string, []>("op_2021_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -1427,16 +1427,16 @@ program(1.0)
             tensor<int32, [1]> out_49_axes_0 = const()[name = tensor<string, []>("out_49_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2032_to_fp16 = const()[name = tensor<string, []>("op_2032_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_49_cast_fp16 = layer_norm(axes = out_49_axes_0, epsilon = var_2032_to_fp16, x = inputs_49_cast_fp16)[name = tensor<string, []>("out_49_cast_fp16")];
-            tensor<fp16, [1024]> inputs_51_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_51_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(87635776)))];
-            tensor<fp16, [1024]> inputs_51_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_51_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(87637888)))];
+            tensor<fp16, [1024]> inputs_51_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_51_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(120145216)))];
+            tensor<fp16, [1024]> inputs_51_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_51_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(120147328)))];
             tensor<fp16, []> inputs_51_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_51_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_51_cast_fp16 = batch_norm(beta = inputs_51_beta_0_to_fp16, epsilon = inputs_51_epsilon_0_to_fp16, gamma = inputs_51_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_49_cast_fp16)[name = tensor<string, []>("inputs_51_cast_fp16")];
             tensor<int32, []> var_2046 = const()[name = tensor<string, []>("op_2046"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_51_axes_0 = const()[name = tensor<string, []>("out_51_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2077_to_fp16 = const()[name = tensor<string, []>("op_2077_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_51_cast_fp16 = layer_norm(axes = out_51_axes_0, epsilon = var_2077_to_fp16, x = inputs_51_cast_fp16)[name = tensor<string, []>("out_51_cast_fp16")];
-            tensor<fp16, [1024]> input_147_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_147_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(87640000)))];
-            tensor<fp16, [1024]> input_147_beta_0_to_fp16 = const()[name = tensor<string, []>("input_147_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(87642112)))];
+            tensor<fp16, [1024]> input_147_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_147_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(120149440)))];
+            tensor<fp16, [1024]> input_147_beta_0_to_fp16 = const()[name = tensor<string, []>("input_147_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(120151552)))];
             tensor<fp16, []> input_147_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_147_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_147_cast_fp16 = batch_norm(beta = input_147_beta_0_to_fp16, epsilon = input_147_epsilon_0_to_fp16, gamma = input_147_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_51_cast_fp16)[name = tensor<string, []>("input_147_cast_fp16")];
             tensor<string, []> var_2097_pad_type_0 = const()[name = tensor<string, []>("op_2097_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1444,14 +1444,14 @@ program(1.0)
             tensor<int32, [4]> var_2097_pad_0 = const()[name = tensor<string, []>("op_2097_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2097_dilations_0 = const()[name = tensor<string, []>("op_2097_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2097_groups_0 = const()[name = tensor<string, []>("op_2097_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_5_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(87644224))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(89741440))), name = tensor<string, []>("layers_5_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_5_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(120153664))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(123299456))), name = tensor<string, []>("layers_5_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2097_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_2097_dilations_0, groups = var_2097_groups_0, pad = var_2097_pad_0, pad_type = var_2097_pad_type_0, strides = var_2097_strides_0, weight = layers_5_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_147_cast_fp16)[name = tensor<string, []>("op_2097_cast_fp16")];
             tensor<string, []> var_2103_pad_type_0 = const()[name = tensor<string, []>("op_2103_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2103_strides_0 = const()[name = tensor<string, []>("op_2103_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2103_pad_0 = const()[name = tensor<string, []>("op_2103_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2103_dilations_0 = const()[name = tensor<string, []>("op_2103_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2103_groups_0 = const()[name = tensor<string, []>("op_2103_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_5_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(89881408))), name = tensor<string, []>("layers_5_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69880]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(89741568))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_5_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(123439488))), name = tensor<string, []>("layers_5_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69880]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(123299648))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2103_cast_fp16 = conv(dilations = var_2103_dilations_0, groups = var_2103_groups_0, pad = var_2103_pad_0, pad_type = var_2103_pad_type_0, strides = var_2103_strides_0, weight = layers_5_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_147_cast_fp16)[name = tensor<string, []>("op_2103_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_149_cast_fp16 = add(x = var_2097_cast_fp16, y = var_2103_cast_fp16)[name = tensor<string, []>("input_149_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_151_cast_fp16 = silu(x = input_149_cast_fp16)[name = tensor<string, []>("input_151_cast_fp16")];
@@ -1460,14 +1460,14 @@ program(1.0)
             tensor<int32, [4]> var_2114_pad_0 = const()[name = tensor<string, []>("op_2114_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2114_dilations_0 = const()[name = tensor<string, []>("op_2114_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2114_groups_0 = const()[name = tensor<string, []>("op_2114_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_5_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(90405760))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(92502976))), name = tensor<string, []>("layers_5_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_5_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(123963840))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127109632))), name = tensor<string, []>("layers_5_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2114_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2114_dilations_0, groups = var_2114_groups_0, pad = var_2114_pad_0, pad_type = var_2114_pad_type_0, strides = var_2114_strides_0, weight = layers_5_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_151_cast_fp16)[name = tensor<string, []>("op_2114_cast_fp16")];
             tensor<string, []> var_2120_pad_type_0 = const()[name = tensor<string, []>("op_2120_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2120_strides_0 = const()[name = tensor<string, []>("op_2120_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2120_pad_0 = const()[name = tensor<string, []>("op_2120_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2120_dilations_0 = const()[name = tensor<string, []>("op_2120_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2120_groups_0 = const()[name = tensor<string, []>("op_2120_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_5_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(92684352))), name = tensor<string, []>("layers_5_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [90588]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(92503104))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_5_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127291072))), name = tensor<string, []>("layers_5_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [90588]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127109824))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2120_cast_fp16 = conv(dilations = var_2120_dilations_0, groups = var_2120_groups_0, pad = var_2120_pad_0, pad_type = var_2120_pad_type_0, strides = var_2120_strides_0, weight = layers_5_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_151_cast_fp16)[name = tensor<string, []>("op_2120_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_33_cast_fp16 = add(x = var_2114_cast_fp16, y = var_2120_cast_fp16)[name = tensor<string, []>("x_33_cast_fp16")];
             tensor<fp16, []> var_2122_to_fp16 = const()[name = tensor<string, []>("op_2122_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -1476,8 +1476,8 @@ program(1.0)
             tensor<int32, [1]> out_53_axes_0 = const()[name = tensor<string, []>("out_53_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2133_to_fp16 = const()[name = tensor<string, []>("op_2133_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_53_cast_fp16 = layer_norm(axes = out_53_axes_0, epsilon = var_2133_to_fp16, x = inputs_53_cast_fp16)[name = tensor<string, []>("out_53_cast_fp16")];
-            tensor<fp16, [1024]> obj_23_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_23_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(93208704)))];
-            tensor<fp16, [1024]> obj_23_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_23_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(93210816)))];
+            tensor<fp16, [1024]> obj_23_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_23_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127815424)))];
+            tensor<fp16, [1024]> obj_23_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_23_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127817536)))];
             tensor<fp16, []> obj_23_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_23_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_23_cast_fp16 = batch_norm(beta = obj_23_beta_0_to_fp16, epsilon = obj_23_epsilon_0_to_fp16, gamma = obj_23_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_53_cast_fp16)[name = tensor<string, []>("obj_23_cast_fp16")];
             tensor<string, []> var_2158_pad_type_0 = const()[name = tensor<string, []>("op_2158_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1485,14 +1485,14 @@ program(1.0)
             tensor<int32, [4]> var_2158_pad_0 = const()[name = tensor<string, []>("op_2158_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2158_dilations_0 = const()[name = tensor<string, []>("op_2158_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2158_groups_0 = const()[name = tensor<string, []>("op_2158_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(93212928))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(93737280))), name = tensor<string, []>("layers_5_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127819648))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128606144))), name = tensor<string, []>("layers_5_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2158_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2158_dilations_0, groups = var_2158_groups_0, pad = var_2158_pad_0, pad_type = var_2158_pad_type_0, strides = var_2158_strides_0, weight = layers_5_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_23_cast_fp16)[name = tensor<string, []>("op_2158_cast_fp16")];
             tensor<string, []> var_2164_pad_type_0 = const()[name = tensor<string, []>("op_2164_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2164_strides_0 = const()[name = tensor<string, []>("op_2164_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2164_pad_0 = const()[name = tensor<string, []>("op_2164_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2164_dilations_0 = const()[name = tensor<string, []>("op_2164_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2164_groups_0 = const()[name = tensor<string, []>("op_2164_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(93772672))), name = tensor<string, []>("layers_5_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17587]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(93737408))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128641600))), name = tensor<string, []>("layers_5_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17587]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128606336))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2164_cast_fp16 = conv(dilations = var_2164_dilations_0, groups = var_2164_groups_0, pad = var_2164_pad_0, pad_type = var_2164_pad_type_0, strides = var_2164_strides_0, weight = layers_5_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_23_cast_fp16)[name = tensor<string, []>("op_2164_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_21_cast_fp16 = add(x = var_2158_cast_fp16, y = var_2164_cast_fp16)[name = tensor<string, []>("query_21_cast_fp16")];
             tensor<string, []> var_2173_pad_type_0 = const()[name = tensor<string, []>("op_2173_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1500,14 +1500,14 @@ program(1.0)
             tensor<int32, [4]> var_2173_pad_0 = const()[name = tensor<string, []>("op_2173_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2173_dilations_0 = const()[name = tensor<string, []>("op_2173_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2173_groups_0 = const()[name = tensor<string, []>("op_2173_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(93903808))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(94428160))), name = tensor<string, []>("layers_5_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128772736))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129559232))), name = tensor<string, []>("layers_5_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2173_cast_fp16 = conv(dilations = var_2173_dilations_0, groups = var_2173_groups_0, pad = var_2173_pad_0, pad_type = var_2173_pad_type_0, strides = var_2173_strides_0, weight = layers_5_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_23_cast_fp16)[name = tensor<string, []>("op_2173_cast_fp16")];
             tensor<string, []> var_2179_pad_type_0 = const()[name = tensor<string, []>("op_2179_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2179_strides_0 = const()[name = tensor<string, []>("op_2179_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2179_pad_0 = const()[name = tensor<string, []>("op_2179_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2179_dilations_0 = const()[name = tensor<string, []>("op_2179_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2179_groups_0 = const()[name = tensor<string, []>("op_2179_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(94469056))), name = tensor<string, []>("layers_5_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [20330]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(94428288))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129600192))), name = tensor<string, []>("layers_5_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [20330]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129559424))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2179_cast_fp16 = conv(dilations = var_2179_dilations_0, groups = var_2179_groups_0, pad = var_2179_pad_0, pad_type = var_2179_pad_type_0, strides = var_2179_strides_0, weight = layers_5_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_23_cast_fp16)[name = tensor<string, []>("op_2179_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_11_cast_fp16 = add(x = var_2173_cast_fp16, y = var_2179_cast_fp16)[name = tensor<string, []>("key_11_cast_fp16")];
             tensor<string, []> var_2189_pad_type_0 = const()[name = tensor<string, []>("op_2189_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1515,33 +1515,33 @@ program(1.0)
             tensor<int32, [4]> var_2189_pad_0 = const()[name = tensor<string, []>("op_2189_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2189_dilations_0 = const()[name = tensor<string, []>("op_2189_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2189_groups_0 = const()[name = tensor<string, []>("op_2189_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(94600192))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(95124544))), name = tensor<string, []>("layers_5_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129731328))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130517824))), name = tensor<string, []>("layers_5_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2189_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2189_dilations_0, groups = var_2189_groups_0, pad = var_2189_pad_0, pad_type = var_2189_pad_type_0, strides = var_2189_strides_0, weight = layers_5_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_23_cast_fp16)[name = tensor<string, []>("op_2189_cast_fp16")];
             tensor<string, []> var_2195_pad_type_0 = const()[name = tensor<string, []>("op_2195_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2195_strides_0 = const()[name = tensor<string, []>("op_2195_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2195_pad_0 = const()[name = tensor<string, []>("op_2195_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2195_dilations_0 = const()[name = tensor<string, []>("op_2195_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2195_groups_0 = const()[name = tensor<string, []>("op_2195_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(95157888))), name = tensor<string, []>("layers_5_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16558]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(95124672))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130551232))), name = tensor<string, []>("layers_5_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16558]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130518016))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2195_cast_fp16 = conv(dilations = var_2195_dilations_0, groups = var_2195_groups_0, pad = var_2195_pad_0, pad_type = var_2195_pad_type_0, strides = var_2195_strides_0, weight = layers_5_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_23_cast_fp16)[name = tensor<string, []>("op_2195_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_11_cast_fp16 = add(x = var_2189_cast_fp16, y = var_2195_cast_fp16)[name = tensor<string, []>("value_11_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_2198_to_fp16 = const()[name = tensor<string, []>("op_2198_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(95289024)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_2198_to_fp16 = const()[name = tensor<string, []>("op_2198_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130682368)))];
             tensor<fp16, [1, 1024, 1, 188]> query_23_cast_fp16 = add(x = query_21_cast_fp16, y = var_2198_to_fp16)[name = tensor<string, []>("query_23_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_2201_to_fp16 = const()[name = tensor<string, []>("op_2201_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(95291136)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_2201_to_fp16 = const()[name = tensor<string, []>("op_2201_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130684480)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_11_cast_fp16 = add(x = query_21_cast_fp16, y = var_2201_to_fp16)[name = tensor<string, []>("q_with_bias_v_11_cast_fp16")];
             tensor<string, []> var_2211_pad_type_0 = const()[name = tensor<string, []>("op_2211_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2211_strides_0 = const()[name = tensor<string, []>("op_2211_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2211_pad_0 = const()[name = tensor<string, []>("op_2211_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2211_dilations_0 = const()[name = tensor<string, []>("op_2211_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2211_groups_0 = const()[name = tensor<string, []>("op_2211_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(95293248))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(95817600))), name = tensor<string, []>("layers_5_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130686592))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131473088))), name = tensor<string, []>("layers_5_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_2211_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2211_dilations_0, groups = var_2211_groups_0, pad = var_2211_pad_0, pad_type = var_2211_pad_type_0, strides = var_2211_strides_0, weight = layers_5_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_2211_cast_fp16")];
             tensor<string, []> var_2217_pad_type_0 = const()[name = tensor<string, []>("op_2217_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2217_strides_0 = const()[name = tensor<string, []>("op_2217_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2217_pad_0 = const()[name = tensor<string, []>("op_2217_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2217_dilations_0 = const()[name = tensor<string, []>("op_2217_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2217_groups_0 = const()[name = tensor<string, []>("op_2217_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(95894784))), name = tensor<string, []>("layers_5_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [38475]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(95817728))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131550336))), name = tensor<string, []>("layers_5_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [38475]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131473280))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_2217_cast_fp16 = conv(dilations = var_2217_dilations_0, groups = var_2217_groups_0, pad = var_2217_pad_0, pad_type = var_2217_pad_type_0, strides = var_2217_strides_0, weight = layers_5_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_2217_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_11_cast_fp16 = add(x = var_2211_cast_fp16, y = var_2217_cast_fp16)[name = tensor<string, []>("p_11_cast_fp16")];
             tensor<int32, [4]> var_2221 = const()[name = tensor<string, []>("op_2221"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -1592,22 +1592,22 @@ program(1.0)
             tensor<int32, [4]> var_2274_pad_0 = const()[name = tensor<string, []>("op_2274_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2274_dilations_0 = const()[name = tensor<string, []>("op_2274_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2274_groups_0 = const()[name = tensor<string, []>("op_2274_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96025920))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96550272))), name = tensor<string, []>("layers_5_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131681472))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132467968))), name = tensor<string, []>("layers_5_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2274_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2274_dilations_0, groups = var_2274_groups_0, pad = var_2274_pad_0, pad_type = var_2274_pad_type_0, strides = var_2274_strides_0, weight = layers_5_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_153_cast_fp16)[name = tensor<string, []>("op_2274_cast_fp16")];
             tensor<string, []> var_2280_pad_type_0 = const()[name = tensor<string, []>("op_2280_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2280_strides_0 = const()[name = tensor<string, []>("op_2280_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2280_pad_0 = const()[name = tensor<string, []>("op_2280_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2280_dilations_0 = const()[name = tensor<string, []>("op_2280_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2280_groups_0 = const()[name = tensor<string, []>("op_2280_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96581952))), name = tensor<string, []>("layers_5_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15724]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96550400))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132499712))), name = tensor<string, []>("layers_5_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15724]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132468160))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2280_cast_fp16 = conv(dilations = var_2280_dilations_0, groups = var_2280_groups_0, pad = var_2280_pad_0, pad_type = var_2280_pad_type_0, strides = var_2280_strides_0, weight = layers_5_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_153_cast_fp16)[name = tensor<string, []>("op_2280_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_25_cast_fp16 = add(x = var_2274_cast_fp16, y = var_2280_cast_fp16)[name = tensor<string, []>("obj_25_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_55_cast_fp16 = add(x = inputs_53_cast_fp16, y = obj_25_cast_fp16)[name = tensor<string, []>("inputs_55_cast_fp16")];
             tensor<int32, [1]> out_55_axes_0 = const()[name = tensor<string, []>("out_55_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2291_to_fp16 = const()[name = tensor<string, []>("op_2291_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_55_cast_fp16 = layer_norm(axes = out_55_axes_0, epsilon = var_2291_to_fp16, x = inputs_55_cast_fp16)[name = tensor<string, []>("out_55_cast_fp16")];
-            tensor<fp16, [1024]> input_155_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_155_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96713088)))];
-            tensor<fp16, [1024]> input_155_beta_0_to_fp16 = const()[name = tensor<string, []>("input_155_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96715200)))];
+            tensor<fp16, [1024]> input_155_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_155_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132630848)))];
+            tensor<fp16, [1024]> input_155_beta_0_to_fp16 = const()[name = tensor<string, []>("input_155_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132632960)))];
             tensor<fp16, []> input_155_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_155_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_155_cast_fp16 = batch_norm(beta = input_155_beta_0_to_fp16, epsilon = input_155_epsilon_0_to_fp16, gamma = input_155_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_55_cast_fp16)[name = tensor<string, []>("input_155_cast_fp16")];
             tensor<string, []> var_2312_pad_type_0 = const()[name = tensor<string, []>("op_2312_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1615,14 +1615,14 @@ program(1.0)
             tensor<int32, [4]> var_2312_pad_0 = const()[name = tensor<string, []>("op_2312_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2312_dilations_0 = const()[name = tensor<string, []>("op_2312_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2312_groups_0 = const()[name = tensor<string, []>("op_2312_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_5_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(96717312))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(97765952))), name = tensor<string, []>("layers_5_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_5_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132635072))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(134208000))), name = tensor<string, []>("layers_5_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_2312_cast_fp16 = conv(dilations = var_2312_dilations_0, groups = var_2312_groups_0, pad = var_2312_pad_0, pad_type = var_2312_pad_type_0, strides = var_2312_strides_0, weight = layers_5_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_155_cast_fp16)[name = tensor<string, []>("op_2312_cast_fp16")];
             tensor<string, []> var_2318_pad_type_0 = const()[name = tensor<string, []>("op_2318_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2318_strides_0 = const()[name = tensor<string, []>("op_2318_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2318_pad_0 = const()[name = tensor<string, []>("op_2318_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2318_dilations_0 = const()[name = tensor<string, []>("op_2318_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2318_groups_0 = const()[name = tensor<string, []>("op_2318_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_5_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(97830016))), name = tensor<string, []>("layers_5_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31935]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(97766080))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_5_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(134272128))), name = tensor<string, []>("layers_5_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31935]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(134208192))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_2318_cast_fp16 = conv(dilations = var_2318_dilations_0, groups = var_2318_groups_0, pad = var_2318_pad_0, pad_type = var_2318_pad_type_0, strides = var_2318_strides_0, weight = layers_5_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_155_cast_fp16)[name = tensor<string, []>("op_2318_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_157_cast_fp16 = add(x = var_2312_cast_fp16, y = var_2318_cast_fp16)[name = tensor<string, []>("input_157_cast_fp16")];
             tensor<int32, []> input_159_split_num_splits_0 = const()[name = tensor<string, []>("input_159_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -1635,8 +1635,8 @@ program(1.0)
             tensor<int32, []> input_161_groups_0 = const()[name = tensor<string, []>("input_161_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_161_strides_0 = const()[name = tensor<string, []>("input_161_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_161_dilations_0 = const()[name = tensor<string, []>("input_161_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_278_to_fp16 = const()[name = tensor<string, []>("const_278_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(98092224)))];
-            tensor<fp16, [1024]> const_279_to_fp16 = const()[name = tensor<string, []>("const_279_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(98110720)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_278_to_fp16 = const()[name = tensor<string, []>("const_278_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(134534336)))];
+            tensor<fp16, [1024]> const_279_to_fp16 = const()[name = tensor<string, []>("const_279_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(134552832)))];
             tensor<fp16, [1, 1024, 1, 188]> input_163_cast_fp16 = conv(bias = const_279_to_fp16, dilations = input_161_dilations_0, groups = input_161_groups_0, pad = input_161_pad_0, pad_type = input_161_pad_type_0, strides = input_161_strides_0, weight = const_278_to_fp16, x = input_159_cast_fp16)[name = tensor<string, []>("input_163_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_165_cast_fp16 = silu(x = input_163_cast_fp16)[name = tensor<string, []>("input_165_cast_fp16")];
             tensor<string, []> var_2340_pad_type_0 = const()[name = tensor<string, []>("op_2340_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1644,22 +1644,22 @@ program(1.0)
             tensor<int32, [4]> var_2340_pad_0 = const()[name = tensor<string, []>("op_2340_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2340_dilations_0 = const()[name = tensor<string, []>("op_2340_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2340_groups_0 = const()[name = tensor<string, []>("op_2340_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(98112832))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(98637184))), name = tensor<string, []>("layers_5_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(134554944))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(135341440))), name = tensor<string, []>("layers_5_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2340_cast_fp16 = conv(dilations = var_2340_dilations_0, groups = var_2340_groups_0, pad = var_2340_pad_0, pad_type = var_2340_pad_type_0, strides = var_2340_strides_0, weight = layers_5_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_165_cast_fp16)[name = tensor<string, []>("op_2340_cast_fp16")];
             tensor<string, []> var_2346_pad_type_0 = const()[name = tensor<string, []>("op_2346_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2346_strides_0 = const()[name = tensor<string, []>("op_2346_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2346_pad_0 = const()[name = tensor<string, []>("op_2346_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2346_dilations_0 = const()[name = tensor<string, []>("op_2346_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2346_groups_0 = const()[name = tensor<string, []>("op_2346_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_5_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(98668992))), name = tensor<string, []>("layers_5_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15808]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(98637312))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_5_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(135373312))), name = tensor<string, []>("layers_5_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15808]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(135341632))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2346_cast_fp16 = conv(dilations = var_2346_dilations_0, groups = var_2346_groups_0, pad = var_2346_pad_0, pad_type = var_2346_pad_type_0, strides = var_2346_strides_0, weight = layers_5_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_165_cast_fp16)[name = tensor<string, []>("op_2346_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_35_cast_fp16 = add(x = var_2340_cast_fp16, y = var_2346_cast_fp16)[name = tensor<string, []>("x_35_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_57_cast_fp16 = add(x = inputs_55_cast_fp16, y = x_35_cast_fp16)[name = tensor<string, []>("inputs_57_cast_fp16")];
             tensor<int32, [1]> out_57_axes_0 = const()[name = tensor<string, []>("out_57_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2357_to_fp16 = const()[name = tensor<string, []>("op_2357_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_57_cast_fp16 = layer_norm(axes = out_57_axes_0, epsilon = var_2357_to_fp16, x = inputs_57_cast_fp16)[name = tensor<string, []>("out_57_cast_fp16")];
-            tensor<fp16, [1024]> input_167_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_167_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(98800128)))];
-            tensor<fp16, [1024]> input_167_beta_0_to_fp16 = const()[name = tensor<string, []>("input_167_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(98802240)))];
+            tensor<fp16, [1024]> input_167_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_167_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(135504448)))];
+            tensor<fp16, [1024]> input_167_beta_0_to_fp16 = const()[name = tensor<string, []>("input_167_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(135506560)))];
             tensor<fp16, []> input_167_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_167_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_167_cast_fp16 = batch_norm(beta = input_167_beta_0_to_fp16, epsilon = input_167_epsilon_0_to_fp16, gamma = input_167_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_57_cast_fp16)[name = tensor<string, []>("input_167_cast_fp16")];
             tensor<string, []> var_2377_pad_type_0 = const()[name = tensor<string, []>("op_2377_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1667,14 +1667,14 @@ program(1.0)
             tensor<int32, [4]> var_2377_pad_0 = const()[name = tensor<string, []>("op_2377_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2377_dilations_0 = const()[name = tensor<string, []>("op_2377_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2377_groups_0 = const()[name = tensor<string, []>("op_2377_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_5_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(98804352))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100901568))), name = tensor<string, []>("layers_5_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_5_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(135508672))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(138654464))), name = tensor<string, []>("layers_5_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2377_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_2377_dilations_0, groups = var_2377_groups_0, pad = var_2377_pad_0, pad_type = var_2377_pad_type_0, strides = var_2377_strides_0, weight = layers_5_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_167_cast_fp16)[name = tensor<string, []>("op_2377_cast_fp16")];
             tensor<string, []> var_2383_pad_type_0 = const()[name = tensor<string, []>("op_2383_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2383_strides_0 = const()[name = tensor<string, []>("op_2383_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2383_pad_0 = const()[name = tensor<string, []>("op_2383_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2383_dilations_0 = const()[name = tensor<string, []>("op_2383_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2383_groups_0 = const()[name = tensor<string, []>("op_2383_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_5_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(101054080))), name = tensor<string, []>("layers_5_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [76149]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(100901696))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_5_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(138807040))), name = tensor<string, []>("layers_5_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [76149]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(138654656))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2383_cast_fp16 = conv(dilations = var_2383_dilations_0, groups = var_2383_groups_0, pad = var_2383_pad_0, pad_type = var_2383_pad_type_0, strides = var_2383_strides_0, weight = layers_5_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_167_cast_fp16)[name = tensor<string, []>("op_2383_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_169_cast_fp16 = add(x = var_2377_cast_fp16, y = var_2383_cast_fp16)[name = tensor<string, []>("input_169_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_171_cast_fp16 = silu(x = input_169_cast_fp16)[name = tensor<string, []>("input_171_cast_fp16")];
@@ -1683,14 +1683,14 @@ program(1.0)
             tensor<int32, [4]> var_2394_pad_0 = const()[name = tensor<string, []>("op_2394_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2394_dilations_0 = const()[name = tensor<string, []>("op_2394_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2394_groups_0 = const()[name = tensor<string, []>("op_2394_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_5_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(101578432))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(103675648))), name = tensor<string, []>("layers_5_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_5_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(139331392))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(142477184))), name = tensor<string, []>("layers_5_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2394_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2394_dilations_0, groups = var_2394_groups_0, pad = var_2394_pad_0, pad_type = var_2394_pad_type_0, strides = var_2394_strides_0, weight = layers_5_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_171_cast_fp16)[name = tensor<string, []>("op_2394_cast_fp16")];
             tensor<string, []> var_2400_pad_type_0 = const()[name = tensor<string, []>("op_2400_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2400_strides_0 = const()[name = tensor<string, []>("op_2400_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2400_pad_0 = const()[name = tensor<string, []>("op_2400_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2400_dilations_0 = const()[name = tensor<string, []>("op_2400_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2400_groups_0 = const()[name = tensor<string, []>("op_2400_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_5_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(103840448))), name = tensor<string, []>("layers_5_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82291]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(103675776))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_5_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(142642048))), name = tensor<string, []>("layers_5_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82291]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(142477376))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2400_cast_fp16 = conv(dilations = var_2400_dilations_0, groups = var_2400_groups_0, pad = var_2400_pad_0, pad_type = var_2400_pad_type_0, strides = var_2400_strides_0, weight = layers_5_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_171_cast_fp16)[name = tensor<string, []>("op_2400_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_37_cast_fp16 = add(x = var_2394_cast_fp16, y = var_2400_cast_fp16)[name = tensor<string, []>("x_37_cast_fp16")];
             tensor<fp16, []> var_2402_to_fp16 = const()[name = tensor<string, []>("op_2402_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -1699,16 +1699,16 @@ program(1.0)
             tensor<int32, [1]> out_59_axes_0 = const()[name = tensor<string, []>("out_59_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2413_to_fp16 = const()[name = tensor<string, []>("op_2413_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_59_cast_fp16 = layer_norm(axes = out_59_axes_0, epsilon = var_2413_to_fp16, x = inputs_59_cast_fp16)[name = tensor<string, []>("out_59_cast_fp16")];
-            tensor<fp16, [1024]> inputs_61_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_61_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104364800)))];
-            tensor<fp16, [1024]> inputs_61_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_61_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104366912)))];
+            tensor<fp16, [1024]> inputs_61_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_61_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143166400)))];
+            tensor<fp16, [1024]> inputs_61_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_61_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143168512)))];
             tensor<fp16, []> inputs_61_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_61_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_61_cast_fp16 = batch_norm(beta = inputs_61_beta_0_to_fp16, epsilon = inputs_61_epsilon_0_to_fp16, gamma = inputs_61_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_59_cast_fp16)[name = tensor<string, []>("inputs_61_cast_fp16")];
             tensor<int32, []> var_2427 = const()[name = tensor<string, []>("op_2427"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_61_axes_0 = const()[name = tensor<string, []>("out_61_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2458_to_fp16 = const()[name = tensor<string, []>("op_2458_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_61_cast_fp16 = layer_norm(axes = out_61_axes_0, epsilon = var_2458_to_fp16, x = inputs_61_cast_fp16)[name = tensor<string, []>("out_61_cast_fp16")];
-            tensor<fp16, [1024]> input_173_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_173_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104369024)))];
-            tensor<fp16, [1024]> input_173_beta_0_to_fp16 = const()[name = tensor<string, []>("input_173_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104371136)))];
+            tensor<fp16, [1024]> input_173_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_173_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143170624)))];
+            tensor<fp16, [1024]> input_173_beta_0_to_fp16 = const()[name = tensor<string, []>("input_173_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143172736)))];
             tensor<fp16, []> input_173_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_173_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_173_cast_fp16 = batch_norm(beta = input_173_beta_0_to_fp16, epsilon = input_173_epsilon_0_to_fp16, gamma = input_173_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_61_cast_fp16)[name = tensor<string, []>("input_173_cast_fp16")];
             tensor<string, []> var_2478_pad_type_0 = const()[name = tensor<string, []>("op_2478_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1716,14 +1716,14 @@ program(1.0)
             tensor<int32, [4]> var_2478_pad_0 = const()[name = tensor<string, []>("op_2478_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2478_dilations_0 = const()[name = tensor<string, []>("op_2478_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2478_groups_0 = const()[name = tensor<string, []>("op_2478_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_6_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(104373248))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(106470464))), name = tensor<string, []>("layers_6_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_6_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143174848))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146320640))), name = tensor<string, []>("layers_6_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2478_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_2478_dilations_0, groups = var_2478_groups_0, pad = var_2478_pad_0, pad_type = var_2478_pad_type_0, strides = var_2478_strides_0, weight = layers_6_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_173_cast_fp16)[name = tensor<string, []>("op_2478_cast_fp16")];
             tensor<string, []> var_2484_pad_type_0 = const()[name = tensor<string, []>("op_2484_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2484_strides_0 = const()[name = tensor<string, []>("op_2484_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2484_pad_0 = const()[name = tensor<string, []>("op_2484_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2484_dilations_0 = const()[name = tensor<string, []>("op_2484_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2484_groups_0 = const()[name = tensor<string, []>("op_2484_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_6_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(106614912))), name = tensor<string, []>("layers_6_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72111]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(106470592))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_6_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146465152))), name = tensor<string, []>("layers_6_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72111]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146320832))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2484_cast_fp16 = conv(dilations = var_2484_dilations_0, groups = var_2484_groups_0, pad = var_2484_pad_0, pad_type = var_2484_pad_type_0, strides = var_2484_strides_0, weight = layers_6_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_173_cast_fp16)[name = tensor<string, []>("op_2484_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_175_cast_fp16 = add(x = var_2478_cast_fp16, y = var_2484_cast_fp16)[name = tensor<string, []>("input_175_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_177_cast_fp16 = silu(x = input_175_cast_fp16)[name = tensor<string, []>("input_177_cast_fp16")];
@@ -1732,14 +1732,14 @@ program(1.0)
             tensor<int32, [4]> var_2495_pad_0 = const()[name = tensor<string, []>("op_2495_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2495_dilations_0 = const()[name = tensor<string, []>("op_2495_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2495_groups_0 = const()[name = tensor<string, []>("op_2495_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_6_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(107139264))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109236480))), name = tensor<string, []>("layers_6_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_6_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146989504))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(150135296))), name = tensor<string, []>("layers_6_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2495_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2495_dilations_0, groups = var_2495_groups_0, pad = var_2495_pad_0, pad_type = var_2495_pad_type_0, strides = var_2495_strides_0, weight = layers_6_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_177_cast_fp16)[name = tensor<string, []>("op_2495_cast_fp16")];
             tensor<string, []> var_2501_pad_type_0 = const()[name = tensor<string, []>("op_2501_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2501_strides_0 = const()[name = tensor<string, []>("op_2501_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2501_pad_0 = const()[name = tensor<string, []>("op_2501_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2501_dilations_0 = const()[name = tensor<string, []>("op_2501_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2501_groups_0 = const()[name = tensor<string, []>("op_2501_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_6_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109415808))), name = tensor<string, []>("layers_6_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89542]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109236608))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_6_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(150314688))), name = tensor<string, []>("layers_6_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89542]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(150135488))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2501_cast_fp16 = conv(dilations = var_2501_dilations_0, groups = var_2501_groups_0, pad = var_2501_pad_0, pad_type = var_2501_pad_type_0, strides = var_2501_strides_0, weight = layers_6_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_177_cast_fp16)[name = tensor<string, []>("op_2501_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_39_cast_fp16 = add(x = var_2495_cast_fp16, y = var_2501_cast_fp16)[name = tensor<string, []>("x_39_cast_fp16")];
             tensor<fp16, []> var_2503_to_fp16 = const()[name = tensor<string, []>("op_2503_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -1748,8 +1748,8 @@ program(1.0)
             tensor<int32, [1]> out_63_axes_0 = const()[name = tensor<string, []>("out_63_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2514_to_fp16 = const()[name = tensor<string, []>("op_2514_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_63_cast_fp16 = layer_norm(axes = out_63_axes_0, epsilon = var_2514_to_fp16, x = inputs_63_cast_fp16)[name = tensor<string, []>("out_63_cast_fp16")];
-            tensor<fp16, [1024]> obj_27_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_27_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109940160)))];
-            tensor<fp16, [1024]> obj_27_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_27_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109942272)))];
+            tensor<fp16, [1024]> obj_27_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_27_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(150839040)))];
+            tensor<fp16, [1024]> obj_27_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_27_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(150841152)))];
             tensor<fp16, []> obj_27_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_27_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_27_cast_fp16 = batch_norm(beta = obj_27_beta_0_to_fp16, epsilon = obj_27_epsilon_0_to_fp16, gamma = obj_27_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_63_cast_fp16)[name = tensor<string, []>("obj_27_cast_fp16")];
             tensor<string, []> var_2539_pad_type_0 = const()[name = tensor<string, []>("op_2539_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1757,14 +1757,14 @@ program(1.0)
             tensor<int32, [4]> var_2539_pad_0 = const()[name = tensor<string, []>("op_2539_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2539_dilations_0 = const()[name = tensor<string, []>("op_2539_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2539_groups_0 = const()[name = tensor<string, []>("op_2539_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(109944384))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(110468736))), name = tensor<string, []>("layers_6_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(150843264))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151629760))), name = tensor<string, []>("layers_6_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2539_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2539_dilations_0, groups = var_2539_groups_0, pad = var_2539_pad_0, pad_type = var_2539_pad_type_0, strides = var_2539_strides_0, weight = layers_6_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_27_cast_fp16)[name = tensor<string, []>("op_2539_cast_fp16")];
             tensor<string, []> var_2545_pad_type_0 = const()[name = tensor<string, []>("op_2545_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2545_strides_0 = const()[name = tensor<string, []>("op_2545_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2545_pad_0 = const()[name = tensor<string, []>("op_2545_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2545_dilations_0 = const()[name = tensor<string, []>("op_2545_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2545_groups_0 = const()[name = tensor<string, []>("op_2545_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(110503168))), name = tensor<string, []>("layers_6_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17101]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(110468864))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151664256))), name = tensor<string, []>("layers_6_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17101]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151629952))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2545_cast_fp16 = conv(dilations = var_2545_dilations_0, groups = var_2545_groups_0, pad = var_2545_pad_0, pad_type = var_2545_pad_type_0, strides = var_2545_strides_0, weight = layers_6_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_27_cast_fp16)[name = tensor<string, []>("op_2545_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_25_cast_fp16 = add(x = var_2539_cast_fp16, y = var_2545_cast_fp16)[name = tensor<string, []>("query_25_cast_fp16")];
             tensor<string, []> var_2554_pad_type_0 = const()[name = tensor<string, []>("op_2554_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1772,14 +1772,14 @@ program(1.0)
             tensor<int32, [4]> var_2554_pad_0 = const()[name = tensor<string, []>("op_2554_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2554_dilations_0 = const()[name = tensor<string, []>("op_2554_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2554_groups_0 = const()[name = tensor<string, []>("op_2554_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(110634304))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111158656))), name = tensor<string, []>("layers_6_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151795392))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(152581888))), name = tensor<string, []>("layers_6_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2554_cast_fp16 = conv(dilations = var_2554_dilations_0, groups = var_2554_groups_0, pad = var_2554_pad_0, pad_type = var_2554_pad_type_0, strides = var_2554_strides_0, weight = layers_6_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_27_cast_fp16)[name = tensor<string, []>("op_2554_cast_fp16")];
             tensor<string, []> var_2560_pad_type_0 = const()[name = tensor<string, []>("op_2560_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2560_strides_0 = const()[name = tensor<string, []>("op_2560_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2560_pad_0 = const()[name = tensor<string, []>("op_2560_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2560_dilations_0 = const()[name = tensor<string, []>("op_2560_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2560_groups_0 = const()[name = tensor<string, []>("op_2560_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111202624))), name = tensor<string, []>("layers_6_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [21860]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111158784))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(152625920))), name = tensor<string, []>("layers_6_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [21860]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(152582080))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2560_cast_fp16 = conv(dilations = var_2560_dilations_0, groups = var_2560_groups_0, pad = var_2560_pad_0, pad_type = var_2560_pad_type_0, strides = var_2560_strides_0, weight = layers_6_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_27_cast_fp16)[name = tensor<string, []>("op_2560_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_13_cast_fp16 = add(x = var_2554_cast_fp16, y = var_2560_cast_fp16)[name = tensor<string, []>("key_13_cast_fp16")];
             tensor<string, []> var_2570_pad_type_0 = const()[name = tensor<string, []>("op_2570_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1787,33 +1787,33 @@ program(1.0)
             tensor<int32, [4]> var_2570_pad_0 = const()[name = tensor<string, []>("op_2570_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2570_dilations_0 = const()[name = tensor<string, []>("op_2570_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2570_groups_0 = const()[name = tensor<string, []>("op_2570_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111333760))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111858112))), name = tensor<string, []>("layers_6_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(152757056))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(153543552))), name = tensor<string, []>("layers_6_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2570_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2570_dilations_0, groups = var_2570_groups_0, pad = var_2570_pad_0, pad_type = var_2570_pad_type_0, strides = var_2570_strides_0, weight = layers_6_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_27_cast_fp16)[name = tensor<string, []>("op_2570_cast_fp16")];
             tensor<string, []> var_2576_pad_type_0 = const()[name = tensor<string, []>("op_2576_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2576_strides_0 = const()[name = tensor<string, []>("op_2576_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2576_pad_0 = const()[name = tensor<string, []>("op_2576_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2576_dilations_0 = const()[name = tensor<string, []>("op_2576_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2576_groups_0 = const()[name = tensor<string, []>("op_2576_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111891520))), name = tensor<string, []>("layers_6_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16588]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(111858240))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(153577024))), name = tensor<string, []>("layers_6_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16588]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(153543744))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2576_cast_fp16 = conv(dilations = var_2576_dilations_0, groups = var_2576_groups_0, pad = var_2576_pad_0, pad_type = var_2576_pad_type_0, strides = var_2576_strides_0, weight = layers_6_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_27_cast_fp16)[name = tensor<string, []>("op_2576_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_13_cast_fp16 = add(x = var_2570_cast_fp16, y = var_2576_cast_fp16)[name = tensor<string, []>("value_13_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_2579_to_fp16 = const()[name = tensor<string, []>("op_2579_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112022656)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_2579_to_fp16 = const()[name = tensor<string, []>("op_2579_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(153708160)))];
             tensor<fp16, [1, 1024, 1, 188]> query_27_cast_fp16 = add(x = query_25_cast_fp16, y = var_2579_to_fp16)[name = tensor<string, []>("query_27_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_2582_to_fp16 = const()[name = tensor<string, []>("op_2582_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112024768)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_2582_to_fp16 = const()[name = tensor<string, []>("op_2582_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(153710272)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_13_cast_fp16 = add(x = query_25_cast_fp16, y = var_2582_to_fp16)[name = tensor<string, []>("q_with_bias_v_13_cast_fp16")];
             tensor<string, []> var_2592_pad_type_0 = const()[name = tensor<string, []>("op_2592_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2592_strides_0 = const()[name = tensor<string, []>("op_2592_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2592_pad_0 = const()[name = tensor<string, []>("op_2592_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2592_dilations_0 = const()[name = tensor<string, []>("op_2592_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2592_groups_0 = const()[name = tensor<string, []>("op_2592_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112026880))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112551232))), name = tensor<string, []>("layers_6_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(153712384))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(154498880))), name = tensor<string, []>("layers_6_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_2592_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2592_dilations_0, groups = var_2592_groups_0, pad = var_2592_pad_0, pad_type = var_2592_pad_type_0, strides = var_2592_strides_0, weight = layers_6_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_2592_cast_fp16")];
             tensor<string, []> var_2598_pad_type_0 = const()[name = tensor<string, []>("op_2598_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2598_strides_0 = const()[name = tensor<string, []>("op_2598_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2598_pad_0 = const()[name = tensor<string, []>("op_2598_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2598_dilations_0 = const()[name = tensor<string, []>("op_2598_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2598_groups_0 = const()[name = tensor<string, []>("op_2598_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112630784))), name = tensor<string, []>("layers_6_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [39657]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112551360))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(154578496))), name = tensor<string, []>("layers_6_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [39657]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(154499072))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_2598_cast_fp16 = conv(dilations = var_2598_dilations_0, groups = var_2598_groups_0, pad = var_2598_pad_0, pad_type = var_2598_pad_type_0, strides = var_2598_strides_0, weight = layers_6_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_2598_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_13_cast_fp16 = add(x = var_2592_cast_fp16, y = var_2598_cast_fp16)[name = tensor<string, []>("p_13_cast_fp16")];
             tensor<int32, [4]> var_2602 = const()[name = tensor<string, []>("op_2602"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -1864,22 +1864,22 @@ program(1.0)
             tensor<int32, [4]> var_2655_pad_0 = const()[name = tensor<string, []>("op_2655_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2655_dilations_0 = const()[name = tensor<string, []>("op_2655_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2655_groups_0 = const()[name = tensor<string, []>("op_2655_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(112761920))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(113286272))), name = tensor<string, []>("layers_6_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(154709632))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(155496128))), name = tensor<string, []>("layers_6_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2655_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2655_dilations_0, groups = var_2655_groups_0, pad = var_2655_pad_0, pad_type = var_2655_pad_type_0, strides = var_2655_strides_0, weight = layers_6_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_179_cast_fp16)[name = tensor<string, []>("op_2655_cast_fp16")];
             tensor<string, []> var_2661_pad_type_0 = const()[name = tensor<string, []>("op_2661_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2661_strides_0 = const()[name = tensor<string, []>("op_2661_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2661_pad_0 = const()[name = tensor<string, []>("op_2661_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2661_dilations_0 = const()[name = tensor<string, []>("op_2661_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2661_groups_0 = const()[name = tensor<string, []>("op_2661_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(113316992))), name = tensor<string, []>("layers_6_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15262]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(113286400))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(155526912))), name = tensor<string, []>("layers_6_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15262]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(155496320))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2661_cast_fp16 = conv(dilations = var_2661_dilations_0, groups = var_2661_groups_0, pad = var_2661_pad_0, pad_type = var_2661_pad_type_0, strides = var_2661_strides_0, weight = layers_6_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_179_cast_fp16)[name = tensor<string, []>("op_2661_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_29_cast_fp16 = add(x = var_2655_cast_fp16, y = var_2661_cast_fp16)[name = tensor<string, []>("obj_29_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_65_cast_fp16 = add(x = inputs_63_cast_fp16, y = obj_29_cast_fp16)[name = tensor<string, []>("inputs_65_cast_fp16")];
             tensor<int32, [1]> out_65_axes_0 = const()[name = tensor<string, []>("out_65_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2672_to_fp16 = const()[name = tensor<string, []>("op_2672_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_65_cast_fp16 = layer_norm(axes = out_65_axes_0, epsilon = var_2672_to_fp16, x = inputs_65_cast_fp16)[name = tensor<string, []>("out_65_cast_fp16")];
-            tensor<fp16, [1024]> input_181_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_181_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(113448128)))];
-            tensor<fp16, [1024]> input_181_beta_0_to_fp16 = const()[name = tensor<string, []>("input_181_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(113450240)))];
+            tensor<fp16, [1024]> input_181_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_181_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(155658048)))];
+            tensor<fp16, [1024]> input_181_beta_0_to_fp16 = const()[name = tensor<string, []>("input_181_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(155660160)))];
             tensor<fp16, []> input_181_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_181_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_181_cast_fp16 = batch_norm(beta = input_181_beta_0_to_fp16, epsilon = input_181_epsilon_0_to_fp16, gamma = input_181_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_65_cast_fp16)[name = tensor<string, []>("input_181_cast_fp16")];
             tensor<string, []> var_2693_pad_type_0 = const()[name = tensor<string, []>("op_2693_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1887,14 +1887,14 @@ program(1.0)
             tensor<int32, [4]> var_2693_pad_0 = const()[name = tensor<string, []>("op_2693_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2693_dilations_0 = const()[name = tensor<string, []>("op_2693_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2693_groups_0 = const()[name = tensor<string, []>("op_2693_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_6_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(113452352))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(114500992))), name = tensor<string, []>("layers_6_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_6_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(155662272))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(157235200))), name = tensor<string, []>("layers_6_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_2693_cast_fp16 = conv(dilations = var_2693_dilations_0, groups = var_2693_groups_0, pad = var_2693_pad_0, pad_type = var_2693_pad_type_0, strides = var_2693_strides_0, weight = layers_6_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_181_cast_fp16)[name = tensor<string, []>("op_2693_cast_fp16")];
             tensor<string, []> var_2699_pad_type_0 = const()[name = tensor<string, []>("op_2699_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2699_strides_0 = const()[name = tensor<string, []>("op_2699_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2699_pad_0 = const()[name = tensor<string, []>("op_2699_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2699_dilations_0 = const()[name = tensor<string, []>("op_2699_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2699_groups_0 = const()[name = tensor<string, []>("op_2699_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_6_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(114565312))), name = tensor<string, []>("layers_6_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32041]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(114501120))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_6_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(157299584))), name = tensor<string, []>("layers_6_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32041]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(157235392))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_2699_cast_fp16 = conv(dilations = var_2699_dilations_0, groups = var_2699_groups_0, pad = var_2699_pad_0, pad_type = var_2699_pad_type_0, strides = var_2699_strides_0, weight = layers_6_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_181_cast_fp16)[name = tensor<string, []>("op_2699_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_183_cast_fp16 = add(x = var_2693_cast_fp16, y = var_2699_cast_fp16)[name = tensor<string, []>("input_183_cast_fp16")];
             tensor<int32, []> input_185_split_num_splits_0 = const()[name = tensor<string, []>("input_185_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -1907,8 +1907,8 @@ program(1.0)
             tensor<int32, []> input_187_groups_0 = const()[name = tensor<string, []>("input_187_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_187_strides_0 = const()[name = tensor<string, []>("input_187_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_187_dilations_0 = const()[name = tensor<string, []>("input_187_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_280_to_fp16 = const()[name = tensor<string, []>("const_280_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(114827520)))];
-            tensor<fp16, [1024]> const_281_to_fp16 = const()[name = tensor<string, []>("const_281_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(114846016)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_280_to_fp16 = const()[name = tensor<string, []>("const_280_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(157561792)))];
+            tensor<fp16, [1024]> const_281_to_fp16 = const()[name = tensor<string, []>("const_281_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(157580288)))];
             tensor<fp16, [1, 1024, 1, 188]> input_189_cast_fp16 = conv(bias = const_281_to_fp16, dilations = input_187_dilations_0, groups = input_187_groups_0, pad = input_187_pad_0, pad_type = input_187_pad_type_0, strides = input_187_strides_0, weight = const_280_to_fp16, x = input_185_cast_fp16)[name = tensor<string, []>("input_189_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_191_cast_fp16 = silu(x = input_189_cast_fp16)[name = tensor<string, []>("input_191_cast_fp16")];
             tensor<string, []> var_2721_pad_type_0 = const()[name = tensor<string, []>("op_2721_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1916,22 +1916,22 @@ program(1.0)
             tensor<int32, [4]> var_2721_pad_0 = const()[name = tensor<string, []>("op_2721_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2721_dilations_0 = const()[name = tensor<string, []>("op_2721_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2721_groups_0 = const()[name = tensor<string, []>("op_2721_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(114848128))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(115372480))), name = tensor<string, []>("layers_6_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(157582400))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(158368896))), name = tensor<string, []>("layers_6_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2721_cast_fp16 = conv(dilations = var_2721_dilations_0, groups = var_2721_groups_0, pad = var_2721_pad_0, pad_type = var_2721_pad_type_0, strides = var_2721_strides_0, weight = layers_6_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_191_cast_fp16)[name = tensor<string, []>("op_2721_cast_fp16")];
             tensor<string, []> var_2727_pad_type_0 = const()[name = tensor<string, []>("op_2727_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2727_strides_0 = const()[name = tensor<string, []>("op_2727_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2727_pad_0 = const()[name = tensor<string, []>("op_2727_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2727_dilations_0 = const()[name = tensor<string, []>("op_2727_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2727_groups_0 = const()[name = tensor<string, []>("op_2727_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_6_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(115404032))), name = tensor<string, []>("layers_6_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15663]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(115372608))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_6_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(158400512))), name = tensor<string, []>("layers_6_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15663]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(158369088))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2727_cast_fp16 = conv(dilations = var_2727_dilations_0, groups = var_2727_groups_0, pad = var_2727_pad_0, pad_type = var_2727_pad_type_0, strides = var_2727_strides_0, weight = layers_6_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_191_cast_fp16)[name = tensor<string, []>("op_2727_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_41_cast_fp16 = add(x = var_2721_cast_fp16, y = var_2727_cast_fp16)[name = tensor<string, []>("x_41_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_67_cast_fp16 = add(x = inputs_65_cast_fp16, y = x_41_cast_fp16)[name = tensor<string, []>("inputs_67_cast_fp16")];
             tensor<int32, [1]> out_67_axes_0 = const()[name = tensor<string, []>("out_67_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2738_to_fp16 = const()[name = tensor<string, []>("op_2738_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_67_cast_fp16 = layer_norm(axes = out_67_axes_0, epsilon = var_2738_to_fp16, x = inputs_67_cast_fp16)[name = tensor<string, []>("out_67_cast_fp16")];
-            tensor<fp16, [1024]> input_193_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_193_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(115535168)))];
-            tensor<fp16, [1024]> input_193_beta_0_to_fp16 = const()[name = tensor<string, []>("input_193_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(115537280)))];
+            tensor<fp16, [1024]> input_193_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_193_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(158531648)))];
+            tensor<fp16, [1024]> input_193_beta_0_to_fp16 = const()[name = tensor<string, []>("input_193_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(158533760)))];
             tensor<fp16, []> input_193_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_193_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_193_cast_fp16 = batch_norm(beta = input_193_beta_0_to_fp16, epsilon = input_193_epsilon_0_to_fp16, gamma = input_193_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_67_cast_fp16)[name = tensor<string, []>("input_193_cast_fp16")];
             tensor<string, []> var_2758_pad_type_0 = const()[name = tensor<string, []>("op_2758_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1939,14 +1939,14 @@ program(1.0)
             tensor<int32, [4]> var_2758_pad_0 = const()[name = tensor<string, []>("op_2758_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2758_dilations_0 = const()[name = tensor<string, []>("op_2758_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2758_groups_0 = const()[name = tensor<string, []>("op_2758_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_6_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(115539392))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(117636608))), name = tensor<string, []>("layers_6_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_6_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(158535872))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(161681664))), name = tensor<string, []>("layers_6_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2758_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_2758_dilations_0, groups = var_2758_groups_0, pad = var_2758_pad_0, pad_type = var_2758_pad_type_0, strides = var_2758_strides_0, weight = layers_6_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_193_cast_fp16)[name = tensor<string, []>("op_2758_cast_fp16")];
             tensor<string, []> var_2764_pad_type_0 = const()[name = tensor<string, []>("op_2764_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2764_strides_0 = const()[name = tensor<string, []>("op_2764_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2764_pad_0 = const()[name = tensor<string, []>("op_2764_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2764_dilations_0 = const()[name = tensor<string, []>("op_2764_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2764_groups_0 = const()[name = tensor<string, []>("op_2764_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_6_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(117781312))), name = tensor<string, []>("layers_6_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72232]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(117636736))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_6_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(161826432))), name = tensor<string, []>("layers_6_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72232]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(161681856))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2764_cast_fp16 = conv(dilations = var_2764_dilations_0, groups = var_2764_groups_0, pad = var_2764_pad_0, pad_type = var_2764_pad_type_0, strides = var_2764_strides_0, weight = layers_6_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_193_cast_fp16)[name = tensor<string, []>("op_2764_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_195_cast_fp16 = add(x = var_2758_cast_fp16, y = var_2764_cast_fp16)[name = tensor<string, []>("input_195_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_197_cast_fp16 = silu(x = input_195_cast_fp16)[name = tensor<string, []>("input_197_cast_fp16")];
@@ -1955,14 +1955,14 @@ program(1.0)
             tensor<int32, [4]> var_2775_pad_0 = const()[name = tensor<string, []>("op_2775_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2775_dilations_0 = const()[name = tensor<string, []>("op_2775_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2775_groups_0 = const()[name = tensor<string, []>("op_2775_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_6_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(118305664))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(120402880))), name = tensor<string, []>("layers_6_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_6_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(162350784))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(165496576))), name = tensor<string, []>("layers_6_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2775_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2775_dilations_0, groups = var_2775_groups_0, pad = var_2775_pad_0, pad_type = var_2775_pad_type_0, strides = var_2775_strides_0, weight = layers_6_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_197_cast_fp16)[name = tensor<string, []>("op_2775_cast_fp16")];
             tensor<string, []> var_2781_pad_type_0 = const()[name = tensor<string, []>("op_2781_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2781_strides_0 = const()[name = tensor<string, []>("op_2781_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2781_pad_0 = const()[name = tensor<string, []>("op_2781_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2781_dilations_0 = const()[name = tensor<string, []>("op_2781_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2781_groups_0 = const()[name = tensor<string, []>("op_2781_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_6_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(120554496))), name = tensor<string, []>("layers_6_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75708]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(120403008))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_6_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(165648256))), name = tensor<string, []>("layers_6_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75708]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(165496768))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2781_cast_fp16 = conv(dilations = var_2781_dilations_0, groups = var_2781_groups_0, pad = var_2781_pad_0, pad_type = var_2781_pad_type_0, strides = var_2781_strides_0, weight = layers_6_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_197_cast_fp16)[name = tensor<string, []>("op_2781_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_43_cast_fp16 = add(x = var_2775_cast_fp16, y = var_2781_cast_fp16)[name = tensor<string, []>("x_43_cast_fp16")];
             tensor<fp16, []> var_2783_to_fp16 = const()[name = tensor<string, []>("op_2783_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -1971,16 +1971,16 @@ program(1.0)
             tensor<int32, [1]> out_69_axes_0 = const()[name = tensor<string, []>("out_69_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2794_to_fp16 = const()[name = tensor<string, []>("op_2794_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_69_cast_fp16 = layer_norm(axes = out_69_axes_0, epsilon = var_2794_to_fp16, x = inputs_69_cast_fp16)[name = tensor<string, []>("out_69_cast_fp16")];
-            tensor<fp16, [1024]> inputs_71_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_71_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(121078848)))];
-            tensor<fp16, [1024]> inputs_71_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_71_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(121080960)))];
+            tensor<fp16, [1024]> inputs_71_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_71_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(166172608)))];
+            tensor<fp16, [1024]> inputs_71_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_71_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(166174720)))];
             tensor<fp16, []> inputs_71_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_71_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_71_cast_fp16 = batch_norm(beta = inputs_71_beta_0_to_fp16, epsilon = inputs_71_epsilon_0_to_fp16, gamma = inputs_71_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_69_cast_fp16)[name = tensor<string, []>("inputs_71_cast_fp16")];
             tensor<int32, []> var_2808 = const()[name = tensor<string, []>("op_2808"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_71_axes_0 = const()[name = tensor<string, []>("out_71_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2839_to_fp16 = const()[name = tensor<string, []>("op_2839_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_71_cast_fp16 = layer_norm(axes = out_71_axes_0, epsilon = var_2839_to_fp16, x = inputs_71_cast_fp16)[name = tensor<string, []>("out_71_cast_fp16")];
-            tensor<fp16, [1024]> input_199_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_199_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(121083072)))];
-            tensor<fp16, [1024]> input_199_beta_0_to_fp16 = const()[name = tensor<string, []>("input_199_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(121085184)))];
+            tensor<fp16, [1024]> input_199_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_199_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(166176832)))];
+            tensor<fp16, [1024]> input_199_beta_0_to_fp16 = const()[name = tensor<string, []>("input_199_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(166178944)))];
             tensor<fp16, []> input_199_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_199_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_199_cast_fp16 = batch_norm(beta = input_199_beta_0_to_fp16, epsilon = input_199_epsilon_0_to_fp16, gamma = input_199_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_71_cast_fp16)[name = tensor<string, []>("input_199_cast_fp16")];
             tensor<string, []> var_2859_pad_type_0 = const()[name = tensor<string, []>("op_2859_pad_type_0"), val = tensor<string, []>("valid")];
@@ -1988,14 +1988,14 @@ program(1.0)
             tensor<int32, [4]> var_2859_pad_0 = const()[name = tensor<string, []>("op_2859_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2859_dilations_0 = const()[name = tensor<string, []>("op_2859_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2859_groups_0 = const()[name = tensor<string, []>("op_2859_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_7_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(121087296))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(123184512))), name = tensor<string, []>("layers_7_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_7_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(166181056))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(169326848))), name = tensor<string, []>("layers_7_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2859_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_2859_dilations_0, groups = var_2859_groups_0, pad = var_2859_pad_0, pad_type = var_2859_pad_type_0, strides = var_2859_strides_0, weight = layers_7_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_199_cast_fp16)[name = tensor<string, []>("op_2859_cast_fp16")];
             tensor<string, []> var_2865_pad_type_0 = const()[name = tensor<string, []>("op_2865_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2865_strides_0 = const()[name = tensor<string, []>("op_2865_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2865_pad_0 = const()[name = tensor<string, []>("op_2865_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2865_dilations_0 = const()[name = tensor<string, []>("op_2865_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2865_groups_0 = const()[name = tensor<string, []>("op_2865_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_7_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(123327104))), name = tensor<string, []>("layers_7_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [71174]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(123184640))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_7_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(169469504))), name = tensor<string, []>("layers_7_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [71174]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(169327040))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_2865_cast_fp16 = conv(dilations = var_2865_dilations_0, groups = var_2865_groups_0, pad = var_2865_pad_0, pad_type = var_2865_pad_type_0, strides = var_2865_strides_0, weight = layers_7_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_199_cast_fp16)[name = tensor<string, []>("op_2865_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_201_cast_fp16 = add(x = var_2859_cast_fp16, y = var_2865_cast_fp16)[name = tensor<string, []>("input_201_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_203_cast_fp16 = silu(x = input_201_cast_fp16)[name = tensor<string, []>("input_203_cast_fp16")];
@@ -2004,14 +2004,14 @@ program(1.0)
             tensor<int32, [4]> var_2876_pad_0 = const()[name = tensor<string, []>("op_2876_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2876_dilations_0 = const()[name = tensor<string, []>("op_2876_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2876_groups_0 = const()[name = tensor<string, []>("op_2876_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_7_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(123851456))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(125948672))), name = tensor<string, []>("layers_7_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_7_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(169993856))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173139648))), name = tensor<string, []>("layers_7_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2876_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2876_dilations_0, groups = var_2876_groups_0, pad = var_2876_pad_0, pad_type = var_2876_pad_type_0, strides = var_2876_strides_0, weight = layers_7_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_203_cast_fp16)[name = tensor<string, []>("op_2876_cast_fp16")];
             tensor<string, []> var_2882_pad_type_0 = const()[name = tensor<string, []>("op_2882_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2882_strides_0 = const()[name = tensor<string, []>("op_2882_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2882_pad_0 = const()[name = tensor<string, []>("op_2882_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2882_dilations_0 = const()[name = tensor<string, []>("op_2882_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2882_groups_0 = const()[name = tensor<string, []>("op_2882_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_7_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(126120576))), name = tensor<string, []>("layers_7_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [85830]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(125948800))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_7_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173311616))), name = tensor<string, []>("layers_7_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [85830]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173139840))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2882_cast_fp16 = conv(dilations = var_2882_dilations_0, groups = var_2882_groups_0, pad = var_2882_pad_0, pad_type = var_2882_pad_type_0, strides = var_2882_strides_0, weight = layers_7_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_203_cast_fp16)[name = tensor<string, []>("op_2882_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_45_cast_fp16 = add(x = var_2876_cast_fp16, y = var_2882_cast_fp16)[name = tensor<string, []>("x_45_cast_fp16")];
             tensor<fp16, []> var_2884_to_fp16 = const()[name = tensor<string, []>("op_2884_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -2020,8 +2020,8 @@ program(1.0)
             tensor<int32, [1]> out_73_axes_0 = const()[name = tensor<string, []>("out_73_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_2895_to_fp16 = const()[name = tensor<string, []>("op_2895_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_73_cast_fp16 = layer_norm(axes = out_73_axes_0, epsilon = var_2895_to_fp16, x = inputs_73_cast_fp16)[name = tensor<string, []>("out_73_cast_fp16")];
-            tensor<fp16, [1024]> obj_31_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_31_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(126644928)))];
-            tensor<fp16, [1024]> obj_31_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_31_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(126647040)))];
+            tensor<fp16, [1024]> obj_31_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_31_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173835968)))];
+            tensor<fp16, [1024]> obj_31_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_31_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173838080)))];
             tensor<fp16, []> obj_31_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_31_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_31_cast_fp16 = batch_norm(beta = obj_31_beta_0_to_fp16, epsilon = obj_31_epsilon_0_to_fp16, gamma = obj_31_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_73_cast_fp16)[name = tensor<string, []>("obj_31_cast_fp16")];
             tensor<string, []> var_2920_pad_type_0 = const()[name = tensor<string, []>("op_2920_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2029,14 +2029,14 @@ program(1.0)
             tensor<int32, [4]> var_2920_pad_0 = const()[name = tensor<string, []>("op_2920_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2920_dilations_0 = const()[name = tensor<string, []>("op_2920_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2920_groups_0 = const()[name = tensor<string, []>("op_2920_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(126649152))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127173504))), name = tensor<string, []>("layers_7_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173840192))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(174626688))), name = tensor<string, []>("layers_7_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2920_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2920_dilations_0, groups = var_2920_groups_0, pad = var_2920_pad_0, pad_type = var_2920_pad_type_0, strides = var_2920_strides_0, weight = layers_7_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_31_cast_fp16)[name = tensor<string, []>("op_2920_cast_fp16")];
             tensor<string, []> var_2926_pad_type_0 = const()[name = tensor<string, []>("op_2926_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2926_strides_0 = const()[name = tensor<string, []>("op_2926_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2926_pad_0 = const()[name = tensor<string, []>("op_2926_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2926_dilations_0 = const()[name = tensor<string, []>("op_2926_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2926_groups_0 = const()[name = tensor<string, []>("op_2926_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127206336))), name = tensor<string, []>("layers_7_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16292]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127173632))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(174659584))), name = tensor<string, []>("layers_7_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16292]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(174626880))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2926_cast_fp16 = conv(dilations = var_2926_dilations_0, groups = var_2926_groups_0, pad = var_2926_pad_0, pad_type = var_2926_pad_type_0, strides = var_2926_strides_0, weight = layers_7_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_31_cast_fp16)[name = tensor<string, []>("op_2926_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_29_cast_fp16 = add(x = var_2920_cast_fp16, y = var_2926_cast_fp16)[name = tensor<string, []>("query_29_cast_fp16")];
             tensor<string, []> var_2935_pad_type_0 = const()[name = tensor<string, []>("op_2935_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2044,14 +2044,14 @@ program(1.0)
             tensor<int32, [4]> var_2935_pad_0 = const()[name = tensor<string, []>("op_2935_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2935_dilations_0 = const()[name = tensor<string, []>("op_2935_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2935_groups_0 = const()[name = tensor<string, []>("op_2935_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127337472))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127861824))), name = tensor<string, []>("layers_7_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(174790720))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(175577216))), name = tensor<string, []>("layers_7_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2935_cast_fp16 = conv(dilations = var_2935_dilations_0, groups = var_2935_groups_0, pad = var_2935_pad_0, pad_type = var_2935_pad_type_0, strides = var_2935_strides_0, weight = layers_7_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_31_cast_fp16)[name = tensor<string, []>("op_2935_cast_fp16")];
             tensor<string, []> var_2941_pad_type_0 = const()[name = tensor<string, []>("op_2941_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2941_strides_0 = const()[name = tensor<string, []>("op_2941_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2941_pad_0 = const()[name = tensor<string, []>("op_2941_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2941_dilations_0 = const()[name = tensor<string, []>("op_2941_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2941_groups_0 = const()[name = tensor<string, []>("op_2941_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127907328))), name = tensor<string, []>("layers_7_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [22635]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(127861952))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(175622784))), name = tensor<string, []>("layers_7_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [22635]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(175577408))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2941_cast_fp16 = conv(dilations = var_2941_dilations_0, groups = var_2941_groups_0, pad = var_2941_pad_0, pad_type = var_2941_pad_type_0, strides = var_2941_strides_0, weight = layers_7_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_31_cast_fp16)[name = tensor<string, []>("op_2941_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_15_cast_fp16 = add(x = var_2935_cast_fp16, y = var_2941_cast_fp16)[name = tensor<string, []>("key_15_cast_fp16")];
             tensor<string, []> var_2951_pad_type_0 = const()[name = tensor<string, []>("op_2951_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2059,33 +2059,33 @@ program(1.0)
             tensor<int32, [4]> var_2951_pad_0 = const()[name = tensor<string, []>("op_2951_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2951_dilations_0 = const()[name = tensor<string, []>("op_2951_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2951_groups_0 = const()[name = tensor<string, []>("op_2951_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128038464))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128562816))), name = tensor<string, []>("layers_7_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(175753920))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176540416))), name = tensor<string, []>("layers_7_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2951_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2951_dilations_0, groups = var_2951_groups_0, pad = var_2951_pad_0, pad_type = var_2951_pad_type_0, strides = var_2951_strides_0, weight = layers_7_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_31_cast_fp16)[name = tensor<string, []>("op_2951_cast_fp16")];
             tensor<string, []> var_2957_pad_type_0 = const()[name = tensor<string, []>("op_2957_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2957_strides_0 = const()[name = tensor<string, []>("op_2957_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2957_pad_0 = const()[name = tensor<string, []>("op_2957_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2957_dilations_0 = const()[name = tensor<string, []>("op_2957_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2957_groups_0 = const()[name = tensor<string, []>("op_2957_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128599872))), name = tensor<string, []>("layers_7_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18408]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128562944))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176577536))), name = tensor<string, []>("layers_7_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18408]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176540608))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_2957_cast_fp16 = conv(dilations = var_2957_dilations_0, groups = var_2957_groups_0, pad = var_2957_pad_0, pad_type = var_2957_pad_type_0, strides = var_2957_strides_0, weight = layers_7_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_31_cast_fp16)[name = tensor<string, []>("op_2957_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_15_cast_fp16 = add(x = var_2951_cast_fp16, y = var_2957_cast_fp16)[name = tensor<string, []>("value_15_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_2960_to_fp16 = const()[name = tensor<string, []>("op_2960_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128731008)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_2960_to_fp16 = const()[name = tensor<string, []>("op_2960_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176708672)))];
             tensor<fp16, [1, 1024, 1, 188]> query_31_cast_fp16 = add(x = query_29_cast_fp16, y = var_2960_to_fp16)[name = tensor<string, []>("query_31_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_2963_to_fp16 = const()[name = tensor<string, []>("op_2963_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128733120)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_2963_to_fp16 = const()[name = tensor<string, []>("op_2963_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176710784)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_15_cast_fp16 = add(x = query_29_cast_fp16, y = var_2963_to_fp16)[name = tensor<string, []>("q_with_bias_v_15_cast_fp16")];
             tensor<string, []> var_2973_pad_type_0 = const()[name = tensor<string, []>("op_2973_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2973_strides_0 = const()[name = tensor<string, []>("op_2973_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2973_pad_0 = const()[name = tensor<string, []>("op_2973_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2973_dilations_0 = const()[name = tensor<string, []>("op_2973_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2973_groups_0 = const()[name = tensor<string, []>("op_2973_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(128735232))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129259584))), name = tensor<string, []>("layers_7_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176712896))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177499392))), name = tensor<string, []>("layers_7_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_2973_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_2973_dilations_0, groups = var_2973_groups_0, pad = var_2973_pad_0, pad_type = var_2973_pad_type_0, strides = var_2973_strides_0, weight = layers_7_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_2973_cast_fp16")];
             tensor<string, []> var_2979_pad_type_0 = const()[name = tensor<string, []>("op_2979_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_2979_strides_0 = const()[name = tensor<string, []>("op_2979_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_2979_pad_0 = const()[name = tensor<string, []>("op_2979_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_2979_dilations_0 = const()[name = tensor<string, []>("op_2979_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_2979_groups_0 = const()[name = tensor<string, []>("op_2979_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129333504))), name = tensor<string, []>("layers_7_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [36851]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129259712))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177573376))), name = tensor<string, []>("layers_7_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [36851]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177499584))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_2979_cast_fp16 = conv(dilations = var_2979_dilations_0, groups = var_2979_groups_0, pad = var_2979_pad_0, pad_type = var_2979_pad_type_0, strides = var_2979_strides_0, weight = layers_7_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_2979_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_15_cast_fp16 = add(x = var_2973_cast_fp16, y = var_2979_cast_fp16)[name = tensor<string, []>("p_15_cast_fp16")];
             tensor<int32, [4]> var_2983 = const()[name = tensor<string, []>("op_2983"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -2136,22 +2136,22 @@ program(1.0)
             tensor<int32, [4]> var_3036_pad_0 = const()[name = tensor<string, []>("op_3036_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3036_dilations_0 = const()[name = tensor<string, []>("op_3036_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3036_groups_0 = const()[name = tensor<string, []>("op_3036_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129464640))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129988992))), name = tensor<string, []>("layers_7_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177704512))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178491008))), name = tensor<string, []>("layers_7_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3036_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3036_dilations_0, groups = var_3036_groups_0, pad = var_3036_pad_0, pad_type = var_3036_pad_type_0, strides = var_3036_strides_0, weight = layers_7_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_205_cast_fp16)[name = tensor<string, []>("op_3036_cast_fp16")];
             tensor<string, []> var_3042_pad_type_0 = const()[name = tensor<string, []>("op_3042_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3042_strides_0 = const()[name = tensor<string, []>("op_3042_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3042_pad_0 = const()[name = tensor<string, []>("op_3042_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3042_dilations_0 = const()[name = tensor<string, []>("op_3042_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3042_groups_0 = const()[name = tensor<string, []>("op_3042_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130018560))), name = tensor<string, []>("layers_7_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14668]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(129989120))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178520640))), name = tensor<string, []>("layers_7_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14668]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178491200))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3042_cast_fp16 = conv(dilations = var_3042_dilations_0, groups = var_3042_groups_0, pad = var_3042_pad_0, pad_type = var_3042_pad_type_0, strides = var_3042_strides_0, weight = layers_7_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_205_cast_fp16)[name = tensor<string, []>("op_3042_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_33_cast_fp16 = add(x = var_3036_cast_fp16, y = var_3042_cast_fp16)[name = tensor<string, []>("obj_33_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_75_cast_fp16 = add(x = inputs_73_cast_fp16, y = obj_33_cast_fp16)[name = tensor<string, []>("inputs_75_cast_fp16")];
             tensor<int32, [1]> out_75_axes_0 = const()[name = tensor<string, []>("out_75_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3053_to_fp16 = const()[name = tensor<string, []>("op_3053_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_75_cast_fp16 = layer_norm(axes = out_75_axes_0, epsilon = var_3053_to_fp16, x = inputs_75_cast_fp16)[name = tensor<string, []>("out_75_cast_fp16")];
-            tensor<fp16, [1024]> input_207_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_207_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130149696)))];
-            tensor<fp16, [1024]> input_207_beta_0_to_fp16 = const()[name = tensor<string, []>("input_207_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130151808)))];
+            tensor<fp16, [1024]> input_207_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_207_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178651776)))];
+            tensor<fp16, [1024]> input_207_beta_0_to_fp16 = const()[name = tensor<string, []>("input_207_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178653888)))];
             tensor<fp16, []> input_207_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_207_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_207_cast_fp16 = batch_norm(beta = input_207_beta_0_to_fp16, epsilon = input_207_epsilon_0_to_fp16, gamma = input_207_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_75_cast_fp16)[name = tensor<string, []>("input_207_cast_fp16")];
             tensor<string, []> var_3074_pad_type_0 = const()[name = tensor<string, []>("op_3074_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2159,14 +2159,14 @@ program(1.0)
             tensor<int32, [4]> var_3074_pad_0 = const()[name = tensor<string, []>("op_3074_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3074_dilations_0 = const()[name = tensor<string, []>("op_3074_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3074_groups_0 = const()[name = tensor<string, []>("op_3074_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_7_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(130153920))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131202560))), name = tensor<string, []>("layers_7_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_7_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178656000))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180228928))), name = tensor<string, []>("layers_7_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_3074_cast_fp16 = conv(dilations = var_3074_dilations_0, groups = var_3074_groups_0, pad = var_3074_pad_0, pad_type = var_3074_pad_type_0, strides = var_3074_strides_0, weight = layers_7_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_207_cast_fp16)[name = tensor<string, []>("op_3074_cast_fp16")];
             tensor<string, []> var_3080_pad_type_0 = const()[name = tensor<string, []>("op_3080_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3080_strides_0 = const()[name = tensor<string, []>("op_3080_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3080_pad_0 = const()[name = tensor<string, []>("op_3080_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3080_dilations_0 = const()[name = tensor<string, []>("op_3080_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3080_groups_0 = const()[name = tensor<string, []>("op_3080_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_7_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131266240))), name = tensor<string, []>("layers_7_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31719]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131202688))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_7_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180292672))), name = tensor<string, []>("layers_7_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31719]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180229120))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_3080_cast_fp16 = conv(dilations = var_3080_dilations_0, groups = var_3080_groups_0, pad = var_3080_pad_0, pad_type = var_3080_pad_type_0, strides = var_3080_strides_0, weight = layers_7_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_207_cast_fp16)[name = tensor<string, []>("op_3080_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_209_cast_fp16 = add(x = var_3074_cast_fp16, y = var_3080_cast_fp16)[name = tensor<string, []>("input_209_cast_fp16")];
             tensor<int32, []> input_211_split_num_splits_0 = const()[name = tensor<string, []>("input_211_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -2179,8 +2179,8 @@ program(1.0)
             tensor<int32, []> input_213_groups_0 = const()[name = tensor<string, []>("input_213_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_213_strides_0 = const()[name = tensor<string, []>("input_213_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_213_dilations_0 = const()[name = tensor<string, []>("input_213_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_282_to_fp16 = const()[name = tensor<string, []>("const_282_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131528448)))];
-            tensor<fp16, [1024]> const_283_to_fp16 = const()[name = tensor<string, []>("const_283_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131546944)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_282_to_fp16 = const()[name = tensor<string, []>("const_282_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180554880)))];
+            tensor<fp16, [1024]> const_283_to_fp16 = const()[name = tensor<string, []>("const_283_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180573376)))];
             tensor<fp16, [1, 1024, 1, 188]> input_215_cast_fp16 = conv(bias = const_283_to_fp16, dilations = input_213_dilations_0, groups = input_213_groups_0, pad = input_213_pad_0, pad_type = input_213_pad_type_0, strides = input_213_strides_0, weight = const_282_to_fp16, x = input_211_cast_fp16)[name = tensor<string, []>("input_215_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_217_cast_fp16 = silu(x = input_215_cast_fp16)[name = tensor<string, []>("input_217_cast_fp16")];
             tensor<string, []> var_3102_pad_type_0 = const()[name = tensor<string, []>("op_3102_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2188,22 +2188,22 @@ program(1.0)
             tensor<int32, [4]> var_3102_pad_0 = const()[name = tensor<string, []>("op_3102_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3102_dilations_0 = const()[name = tensor<string, []>("op_3102_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3102_groups_0 = const()[name = tensor<string, []>("op_3102_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(131549056))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132073408))), name = tensor<string, []>("layers_7_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180575488))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181361984))), name = tensor<string, []>("layers_7_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3102_cast_fp16 = conv(dilations = var_3102_dilations_0, groups = var_3102_groups_0, pad = var_3102_pad_0, pad_type = var_3102_pad_type_0, strides = var_3102_strides_0, weight = layers_7_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_217_cast_fp16)[name = tensor<string, []>("op_3102_cast_fp16")];
             tensor<string, []> var_3108_pad_type_0 = const()[name = tensor<string, []>("op_3108_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3108_strides_0 = const()[name = tensor<string, []>("op_3108_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3108_pad_0 = const()[name = tensor<string, []>("op_3108_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3108_dilations_0 = const()[name = tensor<string, []>("op_3108_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3108_groups_0 = const()[name = tensor<string, []>("op_3108_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_7_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132104832))), name = tensor<string, []>("layers_7_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15594]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132073536))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_7_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181393472))), name = tensor<string, []>("layers_7_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15594]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181362176))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3108_cast_fp16 = conv(dilations = var_3108_dilations_0, groups = var_3108_groups_0, pad = var_3108_pad_0, pad_type = var_3108_pad_type_0, strides = var_3108_strides_0, weight = layers_7_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_217_cast_fp16)[name = tensor<string, []>("op_3108_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_47_cast_fp16 = add(x = var_3102_cast_fp16, y = var_3108_cast_fp16)[name = tensor<string, []>("x_47_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_77_cast_fp16 = add(x = inputs_75_cast_fp16, y = x_47_cast_fp16)[name = tensor<string, []>("inputs_77_cast_fp16")];
             tensor<int32, [1]> out_77_axes_0 = const()[name = tensor<string, []>("out_77_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3119_to_fp16 = const()[name = tensor<string, []>("op_3119_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_77_cast_fp16 = layer_norm(axes = out_77_axes_0, epsilon = var_3119_to_fp16, x = inputs_77_cast_fp16)[name = tensor<string, []>("out_77_cast_fp16")];
-            tensor<fp16, [1024]> input_219_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_219_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132235968)))];
-            tensor<fp16, [1024]> input_219_beta_0_to_fp16 = const()[name = tensor<string, []>("input_219_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132238080)))];
+            tensor<fp16, [1024]> input_219_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_219_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181524608)))];
+            tensor<fp16, [1024]> input_219_beta_0_to_fp16 = const()[name = tensor<string, []>("input_219_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181526720)))];
             tensor<fp16, []> input_219_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_219_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_219_cast_fp16 = batch_norm(beta = input_219_beta_0_to_fp16, epsilon = input_219_epsilon_0_to_fp16, gamma = input_219_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_77_cast_fp16)[name = tensor<string, []>("input_219_cast_fp16")];
             tensor<string, []> var_3139_pad_type_0 = const()[name = tensor<string, []>("op_3139_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2211,14 +2211,14 @@ program(1.0)
             tensor<int32, [4]> var_3139_pad_0 = const()[name = tensor<string, []>("op_3139_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3139_dilations_0 = const()[name = tensor<string, []>("op_3139_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3139_groups_0 = const()[name = tensor<string, []>("op_3139_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_7_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(132240192))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(134337408))), name = tensor<string, []>("layers_7_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_7_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181528832))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(184674624))), name = tensor<string, []>("layers_7_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3139_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_3139_dilations_0, groups = var_3139_groups_0, pad = var_3139_pad_0, pad_type = var_3139_pad_type_0, strides = var_3139_strides_0, weight = layers_7_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_219_cast_fp16)[name = tensor<string, []>("op_3139_cast_fp16")];
             tensor<string, []> var_3145_pad_type_0 = const()[name = tensor<string, []>("op_3145_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3145_strides_0 = const()[name = tensor<string, []>("op_3145_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3145_pad_0 = const()[name = tensor<string, []>("op_3145_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3145_dilations_0 = const()[name = tensor<string, []>("op_3145_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3145_groups_0 = const()[name = tensor<string, []>("op_3145_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_7_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(134486336))), name = tensor<string, []>("layers_7_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [74343]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(134337536))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_7_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(184823616))), name = tensor<string, []>("layers_7_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [74343]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(184674816))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3145_cast_fp16 = conv(dilations = var_3145_dilations_0, groups = var_3145_groups_0, pad = var_3145_pad_0, pad_type = var_3145_pad_type_0, strides = var_3145_strides_0, weight = layers_7_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_219_cast_fp16)[name = tensor<string, []>("op_3145_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_221_cast_fp16 = add(x = var_3139_cast_fp16, y = var_3145_cast_fp16)[name = tensor<string, []>("input_221_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_223_cast_fp16 = silu(x = input_221_cast_fp16)[name = tensor<string, []>("input_223_cast_fp16")];
@@ -2227,14 +2227,14 @@ program(1.0)
             tensor<int32, [4]> var_3156_pad_0 = const()[name = tensor<string, []>("op_3156_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3156_dilations_0 = const()[name = tensor<string, []>("op_3156_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3156_groups_0 = const()[name = tensor<string, []>("op_3156_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_7_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(135010688))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(137107904))), name = tensor<string, []>("layers_7_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_7_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(185347968))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(188493760))), name = tensor<string, []>("layers_7_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3156_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3156_dilations_0, groups = var_3156_groups_0, pad = var_3156_pad_0, pad_type = var_3156_pad_type_0, strides = var_3156_strides_0, weight = layers_7_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_223_cast_fp16)[name = tensor<string, []>("op_3156_cast_fp16")];
             tensor<string, []> var_3162_pad_type_0 = const()[name = tensor<string, []>("op_3162_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3162_strides_0 = const()[name = tensor<string, []>("op_3162_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3162_pad_0 = const()[name = tensor<string, []>("op_3162_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3162_dilations_0 = const()[name = tensor<string, []>("op_3162_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3162_groups_0 = const()[name = tensor<string, []>("op_3162_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_7_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(137258240))), name = tensor<string, []>("layers_7_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75045]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(137108032))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_7_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(188644160))), name = tensor<string, []>("layers_7_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75045]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(188493952))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3162_cast_fp16 = conv(dilations = var_3162_dilations_0, groups = var_3162_groups_0, pad = var_3162_pad_0, pad_type = var_3162_pad_type_0, strides = var_3162_strides_0, weight = layers_7_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_223_cast_fp16)[name = tensor<string, []>("op_3162_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_49_cast_fp16 = add(x = var_3156_cast_fp16, y = var_3162_cast_fp16)[name = tensor<string, []>("x_49_cast_fp16")];
             tensor<fp16, []> var_3164_to_fp16 = const()[name = tensor<string, []>("op_3164_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -2243,16 +2243,16 @@ program(1.0)
             tensor<int32, [1]> out_79_axes_0 = const()[name = tensor<string, []>("out_79_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3175_to_fp16 = const()[name = tensor<string, []>("op_3175_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_79_cast_fp16 = layer_norm(axes = out_79_axes_0, epsilon = var_3175_to_fp16, x = inputs_79_cast_fp16)[name = tensor<string, []>("out_79_cast_fp16")];
-            tensor<fp16, [1024]> inputs_81_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_81_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(137782592)))];
-            tensor<fp16, [1024]> inputs_81_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_81_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(137784704)))];
+            tensor<fp16, [1024]> inputs_81_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_81_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(189168512)))];
+            tensor<fp16, [1024]> inputs_81_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_81_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(189170624)))];
             tensor<fp16, []> inputs_81_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_81_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_81_cast_fp16 = batch_norm(beta = inputs_81_beta_0_to_fp16, epsilon = inputs_81_epsilon_0_to_fp16, gamma = inputs_81_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_79_cast_fp16)[name = tensor<string, []>("inputs_81_cast_fp16")];
             tensor<int32, []> var_3189 = const()[name = tensor<string, []>("op_3189"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_81_axes_0 = const()[name = tensor<string, []>("out_81_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3220_to_fp16 = const()[name = tensor<string, []>("op_3220_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_81_cast_fp16 = layer_norm(axes = out_81_axes_0, epsilon = var_3220_to_fp16, x = inputs_81_cast_fp16)[name = tensor<string, []>("out_81_cast_fp16")];
-            tensor<fp16, [1024]> input_225_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_225_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(137786816)))];
-            tensor<fp16, [1024]> input_225_beta_0_to_fp16 = const()[name = tensor<string, []>("input_225_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(137788928)))];
+            tensor<fp16, [1024]> input_225_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_225_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(189172736)))];
+            tensor<fp16, [1024]> input_225_beta_0_to_fp16 = const()[name = tensor<string, []>("input_225_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(189174848)))];
             tensor<fp16, []> input_225_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_225_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_225_cast_fp16 = batch_norm(beta = input_225_beta_0_to_fp16, epsilon = input_225_epsilon_0_to_fp16, gamma = input_225_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_81_cast_fp16)[name = tensor<string, []>("input_225_cast_fp16")];
             tensor<string, []> var_3240_pad_type_0 = const()[name = tensor<string, []>("op_3240_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2260,14 +2260,14 @@ program(1.0)
             tensor<int32, [4]> var_3240_pad_0 = const()[name = tensor<string, []>("op_3240_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3240_dilations_0 = const()[name = tensor<string, []>("op_3240_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3240_groups_0 = const()[name = tensor<string, []>("op_3240_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_8_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(137791040))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(139888256))), name = tensor<string, []>("layers_8_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_8_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(189176960))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(192322752))), name = tensor<string, []>("layers_8_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3240_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_3240_dilations_0, groups = var_3240_groups_0, pad = var_3240_pad_0, pad_type = var_3240_pad_type_0, strides = var_3240_strides_0, weight = layers_8_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_225_cast_fp16)[name = tensor<string, []>("op_3240_cast_fp16")];
             tensor<string, []> var_3246_pad_type_0 = const()[name = tensor<string, []>("op_3246_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3246_strides_0 = const()[name = tensor<string, []>("op_3246_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3246_pad_0 = const()[name = tensor<string, []>("op_3246_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3246_dilations_0 = const()[name = tensor<string, []>("op_3246_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3246_groups_0 = const()[name = tensor<string, []>("op_3246_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_8_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(140033792))), name = tensor<string, []>("layers_8_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72649]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(139888384))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_8_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(192468352))), name = tensor<string, []>("layers_8_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72649]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(192322944))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3246_cast_fp16 = conv(dilations = var_3246_dilations_0, groups = var_3246_groups_0, pad = var_3246_pad_0, pad_type = var_3246_pad_type_0, strides = var_3246_strides_0, weight = layers_8_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_225_cast_fp16)[name = tensor<string, []>("op_3246_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_227_cast_fp16 = add(x = var_3240_cast_fp16, y = var_3246_cast_fp16)[name = tensor<string, []>("input_227_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_229_cast_fp16 = silu(x = input_227_cast_fp16)[name = tensor<string, []>("input_229_cast_fp16")];
@@ -2276,14 +2276,14 @@ program(1.0)
             tensor<int32, [4]> var_3257_pad_0 = const()[name = tensor<string, []>("op_3257_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3257_dilations_0 = const()[name = tensor<string, []>("op_3257_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3257_groups_0 = const()[name = tensor<string, []>("op_3257_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_8_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(140558144))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(142655360))), name = tensor<string, []>("layers_8_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_8_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(192992704))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196138496))), name = tensor<string, []>("layers_8_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3257_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3257_dilations_0, groups = var_3257_groups_0, pad = var_3257_pad_0, pad_type = var_3257_pad_type_0, strides = var_3257_strides_0, weight = layers_8_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_229_cast_fp16)[name = tensor<string, []>("op_3257_cast_fp16")];
             tensor<string, []> var_3263_pad_type_0 = const()[name = tensor<string, []>("op_3263_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3263_strides_0 = const()[name = tensor<string, []>("op_3263_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3263_pad_0 = const()[name = tensor<string, []>("op_3263_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3263_dilations_0 = const()[name = tensor<string, []>("op_3263_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3263_groups_0 = const()[name = tensor<string, []>("op_3263_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_8_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(142819200))), name = tensor<string, []>("layers_8_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81821]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(142655488))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_8_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196302400))), name = tensor<string, []>("layers_8_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81821]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196138688))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3263_cast_fp16 = conv(dilations = var_3263_dilations_0, groups = var_3263_groups_0, pad = var_3263_pad_0, pad_type = var_3263_pad_type_0, strides = var_3263_strides_0, weight = layers_8_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_229_cast_fp16)[name = tensor<string, []>("op_3263_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_51_cast_fp16 = add(x = var_3257_cast_fp16, y = var_3263_cast_fp16)[name = tensor<string, []>("x_51_cast_fp16")];
             tensor<fp16, []> var_3265_to_fp16 = const()[name = tensor<string, []>("op_3265_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -2292,8 +2292,8 @@ program(1.0)
             tensor<int32, [1]> out_83_axes_0 = const()[name = tensor<string, []>("out_83_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3276_to_fp16 = const()[name = tensor<string, []>("op_3276_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_83_cast_fp16 = layer_norm(axes = out_83_axes_0, epsilon = var_3276_to_fp16, x = inputs_83_cast_fp16)[name = tensor<string, []>("out_83_cast_fp16")];
-            tensor<fp16, [1024]> obj_35_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_35_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143343552)))];
-            tensor<fp16, [1024]> obj_35_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_35_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143345664)))];
+            tensor<fp16, [1024]> obj_35_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_35_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196826752)))];
+            tensor<fp16, [1024]> obj_35_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_35_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196828864)))];
             tensor<fp16, []> obj_35_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_35_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_35_cast_fp16 = batch_norm(beta = obj_35_beta_0_to_fp16, epsilon = obj_35_epsilon_0_to_fp16, gamma = obj_35_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_83_cast_fp16)[name = tensor<string, []>("obj_35_cast_fp16")];
             tensor<string, []> var_3301_pad_type_0 = const()[name = tensor<string, []>("op_3301_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2301,14 +2301,14 @@ program(1.0)
             tensor<int32, [4]> var_3301_pad_0 = const()[name = tensor<string, []>("op_3301_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3301_dilations_0 = const()[name = tensor<string, []>("op_3301_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3301_groups_0 = const()[name = tensor<string, []>("op_3301_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143347776))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143872128))), name = tensor<string, []>("layers_8_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196830976))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(197617472))), name = tensor<string, []>("layers_8_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3301_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3301_dilations_0, groups = var_3301_groups_0, pad = var_3301_pad_0, pad_type = var_3301_pad_type_0, strides = var_3301_strides_0, weight = layers_8_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_35_cast_fp16)[name = tensor<string, []>("op_3301_cast_fp16")];
             tensor<string, []> var_3307_pad_type_0 = const()[name = tensor<string, []>("op_3307_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3307_strides_0 = const()[name = tensor<string, []>("op_3307_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3307_pad_0 = const()[name = tensor<string, []>("op_3307_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3307_dilations_0 = const()[name = tensor<string, []>("op_3307_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3307_groups_0 = const()[name = tensor<string, []>("op_3307_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143909312))), name = tensor<string, []>("layers_8_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18475]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(143872256))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(197654720))), name = tensor<string, []>("layers_8_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18475]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(197617664))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3307_cast_fp16 = conv(dilations = var_3307_dilations_0, groups = var_3307_groups_0, pad = var_3307_pad_0, pad_type = var_3307_pad_type_0, strides = var_3307_strides_0, weight = layers_8_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_35_cast_fp16)[name = tensor<string, []>("op_3307_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_33_cast_fp16 = add(x = var_3301_cast_fp16, y = var_3307_cast_fp16)[name = tensor<string, []>("query_33_cast_fp16")];
             tensor<string, []> var_3316_pad_type_0 = const()[name = tensor<string, []>("op_3316_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2316,14 +2316,14 @@ program(1.0)
             tensor<int32, [4]> var_3316_pad_0 = const()[name = tensor<string, []>("op_3316_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3316_dilations_0 = const()[name = tensor<string, []>("op_3316_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3316_groups_0 = const()[name = tensor<string, []>("op_3316_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(144040448))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(144564800))), name = tensor<string, []>("layers_8_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(197785856))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198572352))), name = tensor<string, []>("layers_8_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3316_cast_fp16 = conv(dilations = var_3316_dilations_0, groups = var_3316_groups_0, pad = var_3316_pad_0, pad_type = var_3316_pad_type_0, strides = var_3316_strides_0, weight = layers_8_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_35_cast_fp16)[name = tensor<string, []>("op_3316_cast_fp16")];
             tensor<string, []> var_3322_pad_type_0 = const()[name = tensor<string, []>("op_3322_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3322_strides_0 = const()[name = tensor<string, []>("op_3322_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3322_pad_0 = const()[name = tensor<string, []>("op_3322_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3322_dilations_0 = const()[name = tensor<string, []>("op_3322_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3322_groups_0 = const()[name = tensor<string, []>("op_3322_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(144614208))), name = tensor<string, []>("layers_8_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [24598]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(144564928))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198621824))), name = tensor<string, []>("layers_8_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [24598]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198572544))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3322_cast_fp16 = conv(dilations = var_3322_dilations_0, groups = var_3322_groups_0, pad = var_3322_pad_0, pad_type = var_3322_pad_type_0, strides = var_3322_strides_0, weight = layers_8_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_35_cast_fp16)[name = tensor<string, []>("op_3322_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_17_cast_fp16 = add(x = var_3316_cast_fp16, y = var_3322_cast_fp16)[name = tensor<string, []>("key_17_cast_fp16")];
             tensor<string, []> var_3332_pad_type_0 = const()[name = tensor<string, []>("op_3332_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2331,33 +2331,33 @@ program(1.0)
             tensor<int32, [4]> var_3332_pad_0 = const()[name = tensor<string, []>("op_3332_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3332_dilations_0 = const()[name = tensor<string, []>("op_3332_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3332_groups_0 = const()[name = tensor<string, []>("op_3332_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(144745344))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(145269696))), name = tensor<string, []>("layers_8_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198752960))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(199539456))), name = tensor<string, []>("layers_8_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3332_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3332_dilations_0, groups = var_3332_groups_0, pad = var_3332_pad_0, pad_type = var_3332_pad_type_0, strides = var_3332_strides_0, weight = layers_8_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_35_cast_fp16)[name = tensor<string, []>("op_3332_cast_fp16")];
             tensor<string, []> var_3338_pad_type_0 = const()[name = tensor<string, []>("op_3338_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3338_strides_0 = const()[name = tensor<string, []>("op_3338_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3338_pad_0 = const()[name = tensor<string, []>("op_3338_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3338_dilations_0 = const()[name = tensor<string, []>("op_3338_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3338_groups_0 = const()[name = tensor<string, []>("op_3338_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(145307200))), name = tensor<string, []>("layers_8_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18648]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(145269824))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(199577024))), name = tensor<string, []>("layers_8_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18648]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(199539648))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3338_cast_fp16 = conv(dilations = var_3338_dilations_0, groups = var_3338_groups_0, pad = var_3338_pad_0, pad_type = var_3338_pad_type_0, strides = var_3338_strides_0, weight = layers_8_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_35_cast_fp16)[name = tensor<string, []>("op_3338_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_17_cast_fp16 = add(x = var_3332_cast_fp16, y = var_3338_cast_fp16)[name = tensor<string, []>("value_17_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_3341_to_fp16 = const()[name = tensor<string, []>("op_3341_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(145438336)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_3341_to_fp16 = const()[name = tensor<string, []>("op_3341_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(199708160)))];
             tensor<fp16, [1, 1024, 1, 188]> query_35_cast_fp16 = add(x = query_33_cast_fp16, y = var_3341_to_fp16)[name = tensor<string, []>("query_35_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_3344_to_fp16 = const()[name = tensor<string, []>("op_3344_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(145440448)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_3344_to_fp16 = const()[name = tensor<string, []>("op_3344_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(199710272)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_17_cast_fp16 = add(x = query_33_cast_fp16, y = var_3344_to_fp16)[name = tensor<string, []>("q_with_bias_v_17_cast_fp16")];
             tensor<string, []> var_3354_pad_type_0 = const()[name = tensor<string, []>("op_3354_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3354_strides_0 = const()[name = tensor<string, []>("op_3354_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3354_pad_0 = const()[name = tensor<string, []>("op_3354_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3354_dilations_0 = const()[name = tensor<string, []>("op_3354_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3354_groups_0 = const()[name = tensor<string, []>("op_3354_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(145442560))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(145966912))), name = tensor<string, []>("layers_8_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(199712384))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(200498880))), name = tensor<string, []>("layers_8_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_3354_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3354_dilations_0, groups = var_3354_groups_0, pad = var_3354_pad_0, pad_type = var_3354_pad_type_0, strides = var_3354_strides_0, weight = layers_8_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_3354_cast_fp16")];
             tensor<string, []> var_3360_pad_type_0 = const()[name = tensor<string, []>("op_3360_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3360_strides_0 = const()[name = tensor<string, []>("op_3360_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3360_pad_0 = const()[name = tensor<string, []>("op_3360_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3360_dilations_0 = const()[name = tensor<string, []>("op_3360_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3360_groups_0 = const()[name = tensor<string, []>("op_3360_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146040192))), name = tensor<string, []>("layers_8_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [36544]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(145967040))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(200572224))), name = tensor<string, []>("layers_8_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [36544]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(200499072))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_3360_cast_fp16 = conv(dilations = var_3360_dilations_0, groups = var_3360_groups_0, pad = var_3360_pad_0, pad_type = var_3360_pad_type_0, strides = var_3360_strides_0, weight = layers_8_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_3360_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_17_cast_fp16 = add(x = var_3354_cast_fp16, y = var_3360_cast_fp16)[name = tensor<string, []>("p_17_cast_fp16")];
             tensor<int32, [4]> var_3364 = const()[name = tensor<string, []>("op_3364"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -2408,22 +2408,22 @@ program(1.0)
             tensor<int32, [4]> var_3417_pad_0 = const()[name = tensor<string, []>("op_3417_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3417_dilations_0 = const()[name = tensor<string, []>("op_3417_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3417_groups_0 = const()[name = tensor<string, []>("op_3417_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146171328))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146695680))), name = tensor<string, []>("layers_8_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(200703360))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201489856))), name = tensor<string, []>("layers_8_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3417_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3417_dilations_0, groups = var_3417_groups_0, pad = var_3417_pad_0, pad_type = var_3417_pad_type_0, strides = var_3417_strides_0, weight = layers_8_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_231_cast_fp16)[name = tensor<string, []>("op_3417_cast_fp16")];
             tensor<string, []> var_3423_pad_type_0 = const()[name = tensor<string, []>("op_3423_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3423_strides_0 = const()[name = tensor<string, []>("op_3423_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3423_pad_0 = const()[name = tensor<string, []>("op_3423_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3423_dilations_0 = const()[name = tensor<string, []>("op_3423_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3423_groups_0 = const()[name = tensor<string, []>("op_3423_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146727616))), name = tensor<string, []>("layers_8_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15868]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146695808))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201521856))), name = tensor<string, []>("layers_8_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15868]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201490048))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3423_cast_fp16 = conv(dilations = var_3423_dilations_0, groups = var_3423_groups_0, pad = var_3423_pad_0, pad_type = var_3423_pad_type_0, strides = var_3423_strides_0, weight = layers_8_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_231_cast_fp16)[name = tensor<string, []>("op_3423_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_37_cast_fp16 = add(x = var_3417_cast_fp16, y = var_3423_cast_fp16)[name = tensor<string, []>("obj_37_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_85_cast_fp16 = add(x = inputs_83_cast_fp16, y = obj_37_cast_fp16)[name = tensor<string, []>("inputs_85_cast_fp16")];
             tensor<int32, [1]> out_85_axes_0 = const()[name = tensor<string, []>("out_85_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3434_to_fp16 = const()[name = tensor<string, []>("op_3434_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_85_cast_fp16 = layer_norm(axes = out_85_axes_0, epsilon = var_3434_to_fp16, x = inputs_85_cast_fp16)[name = tensor<string, []>("out_85_cast_fp16")];
-            tensor<fp16, [1024]> input_233_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_233_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146858752)))];
-            tensor<fp16, [1024]> input_233_beta_0_to_fp16 = const()[name = tensor<string, []>("input_233_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146860864)))];
+            tensor<fp16, [1024]> input_233_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_233_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201652992)))];
+            tensor<fp16, [1024]> input_233_beta_0_to_fp16 = const()[name = tensor<string, []>("input_233_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201655104)))];
             tensor<fp16, []> input_233_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_233_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_233_cast_fp16 = batch_norm(beta = input_233_beta_0_to_fp16, epsilon = input_233_epsilon_0_to_fp16, gamma = input_233_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_85_cast_fp16)[name = tensor<string, []>("input_233_cast_fp16")];
             tensor<string, []> var_3455_pad_type_0 = const()[name = tensor<string, []>("op_3455_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2431,14 +2431,14 @@ program(1.0)
             tensor<int32, [4]> var_3455_pad_0 = const()[name = tensor<string, []>("op_3455_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3455_dilations_0 = const()[name = tensor<string, []>("op_3455_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3455_groups_0 = const()[name = tensor<string, []>("op_3455_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_8_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(146862976))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(147911616))), name = tensor<string, []>("layers_8_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_8_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201657216))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(203230144))), name = tensor<string, []>("layers_8_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_3455_cast_fp16 = conv(dilations = var_3455_dilations_0, groups = var_3455_groups_0, pad = var_3455_pad_0, pad_type = var_3455_pad_type_0, strides = var_3455_strides_0, weight = layers_8_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_233_cast_fp16)[name = tensor<string, []>("op_3455_cast_fp16")];
             tensor<string, []> var_3461_pad_type_0 = const()[name = tensor<string, []>("op_3461_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3461_strides_0 = const()[name = tensor<string, []>("op_3461_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3461_pad_0 = const()[name = tensor<string, []>("op_3461_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3461_dilations_0 = const()[name = tensor<string, []>("op_3461_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3461_groups_0 = const()[name = tensor<string, []>("op_3461_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_8_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(147972288))), name = tensor<string, []>("layers_8_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [30221]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(147911744))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_8_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(203290880))), name = tensor<string, []>("layers_8_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [30221]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(203230336))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_3461_cast_fp16 = conv(dilations = var_3461_dilations_0, groups = var_3461_groups_0, pad = var_3461_pad_0, pad_type = var_3461_pad_type_0, strides = var_3461_strides_0, weight = layers_8_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_233_cast_fp16)[name = tensor<string, []>("op_3461_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_235_cast_fp16 = add(x = var_3455_cast_fp16, y = var_3461_cast_fp16)[name = tensor<string, []>("input_235_cast_fp16")];
             tensor<int32, []> input_237_split_num_splits_0 = const()[name = tensor<string, []>("input_237_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -2451,8 +2451,8 @@ program(1.0)
             tensor<int32, []> input_239_groups_0 = const()[name = tensor<string, []>("input_239_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_239_strides_0 = const()[name = tensor<string, []>("input_239_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_239_dilations_0 = const()[name = tensor<string, []>("input_239_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_284_to_fp16 = const()[name = tensor<string, []>("const_284_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(148234496)))];
-            tensor<fp16, [1024]> const_285_to_fp16 = const()[name = tensor<string, []>("const_285_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(148252992)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_284_to_fp16 = const()[name = tensor<string, []>("const_284_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(203553088)))];
+            tensor<fp16, [1024]> const_285_to_fp16 = const()[name = tensor<string, []>("const_285_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(203571584)))];
             tensor<fp16, [1, 1024, 1, 188]> input_241_cast_fp16 = conv(bias = const_285_to_fp16, dilations = input_239_dilations_0, groups = input_239_groups_0, pad = input_239_pad_0, pad_type = input_239_pad_type_0, strides = input_239_strides_0, weight = const_284_to_fp16, x = input_237_cast_fp16)[name = tensor<string, []>("input_241_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_243_cast_fp16 = silu(x = input_241_cast_fp16)[name = tensor<string, []>("input_243_cast_fp16")];
             tensor<string, []> var_3483_pad_type_0 = const()[name = tensor<string, []>("op_3483_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2460,22 +2460,22 @@ program(1.0)
             tensor<int32, [4]> var_3483_pad_0 = const()[name = tensor<string, []>("op_3483_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3483_dilations_0 = const()[name = tensor<string, []>("op_3483_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3483_groups_0 = const()[name = tensor<string, []>("op_3483_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(148255104))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(148779456))), name = tensor<string, []>("layers_8_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(203573696))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204360192))), name = tensor<string, []>("layers_8_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3483_cast_fp16 = conv(dilations = var_3483_dilations_0, groups = var_3483_groups_0, pad = var_3483_pad_0, pad_type = var_3483_pad_type_0, strides = var_3483_strides_0, weight = layers_8_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_243_cast_fp16)[name = tensor<string, []>("op_3483_cast_fp16")];
             tensor<string, []> var_3489_pad_type_0 = const()[name = tensor<string, []>("op_3489_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3489_strides_0 = const()[name = tensor<string, []>("op_3489_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3489_pad_0 = const()[name = tensor<string, []>("op_3489_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3489_dilations_0 = const()[name = tensor<string, []>("op_3489_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3489_groups_0 = const()[name = tensor<string, []>("op_3489_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_8_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(148809664))), name = tensor<string, []>("layers_8_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15006]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(148779584))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_8_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204390464))), name = tensor<string, []>("layers_8_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15006]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204360384))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3489_cast_fp16 = conv(dilations = var_3489_dilations_0, groups = var_3489_groups_0, pad = var_3489_pad_0, pad_type = var_3489_pad_type_0, strides = var_3489_strides_0, weight = layers_8_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_243_cast_fp16)[name = tensor<string, []>("op_3489_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_53_cast_fp16 = add(x = var_3483_cast_fp16, y = var_3489_cast_fp16)[name = tensor<string, []>("x_53_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_87_cast_fp16 = add(x = inputs_85_cast_fp16, y = x_53_cast_fp16)[name = tensor<string, []>("inputs_87_cast_fp16")];
             tensor<int32, [1]> out_87_axes_0 = const()[name = tensor<string, []>("out_87_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3500_to_fp16 = const()[name = tensor<string, []>("op_3500_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_87_cast_fp16 = layer_norm(axes = out_87_axes_0, epsilon = var_3500_to_fp16, x = inputs_87_cast_fp16)[name = tensor<string, []>("out_87_cast_fp16")];
-            tensor<fp16, [1024]> input_245_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_245_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(148940800)))];
-            tensor<fp16, [1024]> input_245_beta_0_to_fp16 = const()[name = tensor<string, []>("input_245_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(148942912)))];
+            tensor<fp16, [1024]> input_245_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_245_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204521600)))];
+            tensor<fp16, [1024]> input_245_beta_0_to_fp16 = const()[name = tensor<string, []>("input_245_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204523712)))];
             tensor<fp16, []> input_245_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_245_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_245_cast_fp16 = batch_norm(beta = input_245_beta_0_to_fp16, epsilon = input_245_epsilon_0_to_fp16, gamma = input_245_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_87_cast_fp16)[name = tensor<string, []>("input_245_cast_fp16")];
             tensor<string, []> var_3520_pad_type_0 = const()[name = tensor<string, []>("op_3520_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2483,14 +2483,14 @@ program(1.0)
             tensor<int32, [4]> var_3520_pad_0 = const()[name = tensor<string, []>("op_3520_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3520_dilations_0 = const()[name = tensor<string, []>("op_3520_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3520_groups_0 = const()[name = tensor<string, []>("op_3520_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_8_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(148945024))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151042240))), name = tensor<string, []>("layers_8_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_8_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204525824))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(207671616))), name = tensor<string, []>("layers_8_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3520_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_3520_dilations_0, groups = var_3520_groups_0, pad = var_3520_pad_0, pad_type = var_3520_pad_type_0, strides = var_3520_strides_0, weight = layers_8_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_245_cast_fp16)[name = tensor<string, []>("op_3520_cast_fp16")];
             tensor<string, []> var_3526_pad_type_0 = const()[name = tensor<string, []>("op_3526_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3526_strides_0 = const()[name = tensor<string, []>("op_3526_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3526_pad_0 = const()[name = tensor<string, []>("op_3526_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3526_dilations_0 = const()[name = tensor<string, []>("op_3526_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3526_groups_0 = const()[name = tensor<string, []>("op_3526_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_8_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151190656))), name = tensor<string, []>("layers_8_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [74092]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151042368))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_8_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(207820096))), name = tensor<string, []>("layers_8_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [74092]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(207671808))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3526_cast_fp16 = conv(dilations = var_3526_dilations_0, groups = var_3526_groups_0, pad = var_3526_pad_0, pad_type = var_3526_pad_type_0, strides = var_3526_strides_0, weight = layers_8_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_245_cast_fp16)[name = tensor<string, []>("op_3526_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_247_cast_fp16 = add(x = var_3520_cast_fp16, y = var_3526_cast_fp16)[name = tensor<string, []>("input_247_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_249_cast_fp16 = silu(x = input_247_cast_fp16)[name = tensor<string, []>("input_249_cast_fp16")];
@@ -2499,14 +2499,14 @@ program(1.0)
             tensor<int32, [4]> var_3537_pad_0 = const()[name = tensor<string, []>("op_3537_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3537_dilations_0 = const()[name = tensor<string, []>("op_3537_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3537_groups_0 = const()[name = tensor<string, []>("op_3537_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_8_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(151715008))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(153812224))), name = tensor<string, []>("layers_8_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_8_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(208344448))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(211490240))), name = tensor<string, []>("layers_8_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3537_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3537_dilations_0, groups = var_3537_groups_0, pad = var_3537_pad_0, pad_type = var_3537_pad_type_0, strides = var_3537_strides_0, weight = layers_8_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_249_cast_fp16)[name = tensor<string, []>("op_3537_cast_fp16")];
             tensor<string, []> var_3543_pad_type_0 = const()[name = tensor<string, []>("op_3543_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3543_strides_0 = const()[name = tensor<string, []>("op_3543_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3543_pad_0 = const()[name = tensor<string, []>("op_3543_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3543_dilations_0 = const()[name = tensor<string, []>("op_3543_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3543_groups_0 = const()[name = tensor<string, []>("op_3543_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_8_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(153954816))), name = tensor<string, []>("layers_8_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [71186]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(153812352))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_8_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(211632896))), name = tensor<string, []>("layers_8_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [71186]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(211490432))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3543_cast_fp16 = conv(dilations = var_3543_dilations_0, groups = var_3543_groups_0, pad = var_3543_pad_0, pad_type = var_3543_pad_type_0, strides = var_3543_strides_0, weight = layers_8_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_249_cast_fp16)[name = tensor<string, []>("op_3543_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_55_cast_fp16 = add(x = var_3537_cast_fp16, y = var_3543_cast_fp16)[name = tensor<string, []>("x_55_cast_fp16")];
             tensor<fp16, []> var_3545_to_fp16 = const()[name = tensor<string, []>("op_3545_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -2515,16 +2515,16 @@ program(1.0)
             tensor<int32, [1]> out_89_axes_0 = const()[name = tensor<string, []>("out_89_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3556_to_fp16 = const()[name = tensor<string, []>("op_3556_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_89_cast_fp16 = layer_norm(axes = out_89_axes_0, epsilon = var_3556_to_fp16, x = inputs_89_cast_fp16)[name = tensor<string, []>("out_89_cast_fp16")];
-            tensor<fp16, [1024]> inputs_91_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_91_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(154479168)))];
-            tensor<fp16, [1024]> inputs_91_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_91_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(154481280)))];
+            tensor<fp16, [1024]> inputs_91_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_91_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212157248)))];
+            tensor<fp16, [1024]> inputs_91_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_91_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212159360)))];
             tensor<fp16, []> inputs_91_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_91_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_91_cast_fp16 = batch_norm(beta = inputs_91_beta_0_to_fp16, epsilon = inputs_91_epsilon_0_to_fp16, gamma = inputs_91_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_89_cast_fp16)[name = tensor<string, []>("inputs_91_cast_fp16")];
             tensor<int32, []> var_3570 = const()[name = tensor<string, []>("op_3570"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_91_axes_0 = const()[name = tensor<string, []>("out_91_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3601_to_fp16 = const()[name = tensor<string, []>("op_3601_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_91_cast_fp16 = layer_norm(axes = out_91_axes_0, epsilon = var_3601_to_fp16, x = inputs_91_cast_fp16)[name = tensor<string, []>("out_91_cast_fp16")];
-            tensor<fp16, [1024]> input_251_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_251_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(154483392)))];
-            tensor<fp16, [1024]> input_251_beta_0_to_fp16 = const()[name = tensor<string, []>("input_251_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(154485504)))];
+            tensor<fp16, [1024]> input_251_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_251_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212161472)))];
+            tensor<fp16, [1024]> input_251_beta_0_to_fp16 = const()[name = tensor<string, []>("input_251_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212163584)))];
             tensor<fp16, []> input_251_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_251_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_251_cast_fp16 = batch_norm(beta = input_251_beta_0_to_fp16, epsilon = input_251_epsilon_0_to_fp16, gamma = input_251_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_91_cast_fp16)[name = tensor<string, []>("input_251_cast_fp16")];
             tensor<string, []> var_3621_pad_type_0 = const()[name = tensor<string, []>("op_3621_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2532,14 +2532,14 @@ program(1.0)
             tensor<int32, [4]> var_3621_pad_0 = const()[name = tensor<string, []>("op_3621_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3621_dilations_0 = const()[name = tensor<string, []>("op_3621_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3621_groups_0 = const()[name = tensor<string, []>("op_3621_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_9_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(154487616))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(156584832))), name = tensor<string, []>("layers_9_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_9_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212165696))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215311488))), name = tensor<string, []>("layers_9_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3621_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_3621_dilations_0, groups = var_3621_groups_0, pad = var_3621_pad_0, pad_type = var_3621_pad_type_0, strides = var_3621_strides_0, weight = layers_9_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_251_cast_fp16)[name = tensor<string, []>("op_3621_cast_fp16")];
             tensor<string, []> var_3627_pad_type_0 = const()[name = tensor<string, []>("op_3627_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3627_strides_0 = const()[name = tensor<string, []>("op_3627_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3627_pad_0 = const()[name = tensor<string, []>("op_3627_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3627_dilations_0 = const()[name = tensor<string, []>("op_3627_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3627_groups_0 = const()[name = tensor<string, []>("op_3627_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_9_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(156742784))), name = tensor<string, []>("layers_9_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [78865]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(156584960))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_9_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215469504))), name = tensor<string, []>("layers_9_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [78865]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215311680))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3627_cast_fp16 = conv(dilations = var_3627_dilations_0, groups = var_3627_groups_0, pad = var_3627_pad_0, pad_type = var_3627_pad_type_0, strides = var_3627_strides_0, weight = layers_9_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_251_cast_fp16)[name = tensor<string, []>("op_3627_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_253_cast_fp16 = add(x = var_3621_cast_fp16, y = var_3627_cast_fp16)[name = tensor<string, []>("input_253_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_255_cast_fp16 = silu(x = input_253_cast_fp16)[name = tensor<string, []>("input_255_cast_fp16")];
@@ -2548,14 +2548,14 @@ program(1.0)
             tensor<int32, [4]> var_3638_pad_0 = const()[name = tensor<string, []>("op_3638_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3638_dilations_0 = const()[name = tensor<string, []>("op_3638_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3638_groups_0 = const()[name = tensor<string, []>("op_3638_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_9_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(157267136))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(159364352))), name = tensor<string, []>("layers_9_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_9_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215993856))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(219139648))), name = tensor<string, []>("layers_9_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3638_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3638_dilations_0, groups = var_3638_groups_0, pad = var_3638_pad_0, pad_type = var_3638_pad_type_0, strides = var_3638_strides_0, weight = layers_9_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_255_cast_fp16)[name = tensor<string, []>("op_3638_cast_fp16")];
             tensor<string, []> var_3644_pad_type_0 = const()[name = tensor<string, []>("op_3644_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3644_strides_0 = const()[name = tensor<string, []>("op_3644_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3644_pad_0 = const()[name = tensor<string, []>("op_3644_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3644_dilations_0 = const()[name = tensor<string, []>("op_3644_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3644_groups_0 = const()[name = tensor<string, []>("op_3644_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_9_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(159526208))), name = tensor<string, []>("layers_9_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [80809]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(159364480))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_9_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(219301568))), name = tensor<string, []>("layers_9_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [80809]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(219139840))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3644_cast_fp16 = conv(dilations = var_3644_dilations_0, groups = var_3644_groups_0, pad = var_3644_pad_0, pad_type = var_3644_pad_type_0, strides = var_3644_strides_0, weight = layers_9_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_255_cast_fp16)[name = tensor<string, []>("op_3644_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_57_cast_fp16 = add(x = var_3638_cast_fp16, y = var_3644_cast_fp16)[name = tensor<string, []>("x_57_cast_fp16")];
             tensor<fp16, []> var_3646_to_fp16 = const()[name = tensor<string, []>("op_3646_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -2564,8 +2564,8 @@ program(1.0)
             tensor<int32, [1]> out_93_axes_0 = const()[name = tensor<string, []>("out_93_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3657_to_fp16 = const()[name = tensor<string, []>("op_3657_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_93_cast_fp16 = layer_norm(axes = out_93_axes_0, epsilon = var_3657_to_fp16, x = inputs_93_cast_fp16)[name = tensor<string, []>("out_93_cast_fp16")];
-            tensor<fp16, [1024]> obj_39_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_39_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(160050560)))];
-            tensor<fp16, [1024]> obj_39_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_39_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(160052672)))];
+            tensor<fp16, [1024]> obj_39_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_39_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(219825920)))];
+            tensor<fp16, [1024]> obj_39_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_39_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(219828032)))];
             tensor<fp16, []> obj_39_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_39_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_39_cast_fp16 = batch_norm(beta = obj_39_beta_0_to_fp16, epsilon = obj_39_epsilon_0_to_fp16, gamma = obj_39_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_93_cast_fp16)[name = tensor<string, []>("obj_39_cast_fp16")];
             tensor<string, []> var_3682_pad_type_0 = const()[name = tensor<string, []>("op_3682_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2573,14 +2573,14 @@ program(1.0)
             tensor<int32, [4]> var_3682_pad_0 = const()[name = tensor<string, []>("op_3682_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3682_dilations_0 = const()[name = tensor<string, []>("op_3682_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3682_groups_0 = const()[name = tensor<string, []>("op_3682_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(160054784))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(160579136))), name = tensor<string, []>("layers_9_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(219830144))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220616640))), name = tensor<string, []>("layers_9_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3682_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3682_dilations_0, groups = var_3682_groups_0, pad = var_3682_pad_0, pad_type = var_3682_pad_type_0, strides = var_3682_strides_0, weight = layers_9_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_39_cast_fp16)[name = tensor<string, []>("op_3682_cast_fp16")];
             tensor<string, []> var_3688_pad_type_0 = const()[name = tensor<string, []>("op_3688_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3688_strides_0 = const()[name = tensor<string, []>("op_3688_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3688_pad_0 = const()[name = tensor<string, []>("op_3688_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3688_dilations_0 = const()[name = tensor<string, []>("op_3688_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3688_groups_0 = const()[name = tensor<string, []>("op_3688_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(160618816))), name = tensor<string, []>("layers_9_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19714]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(160579264))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220656384))), name = tensor<string, []>("layers_9_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19714]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220616832))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3688_cast_fp16 = conv(dilations = var_3688_dilations_0, groups = var_3688_groups_0, pad = var_3688_pad_0, pad_type = var_3688_pad_type_0, strides = var_3688_strides_0, weight = layers_9_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_39_cast_fp16)[name = tensor<string, []>("op_3688_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_37_cast_fp16 = add(x = var_3682_cast_fp16, y = var_3688_cast_fp16)[name = tensor<string, []>("query_37_cast_fp16")];
             tensor<string, []> var_3697_pad_type_0 = const()[name = tensor<string, []>("op_3697_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2588,14 +2588,14 @@ program(1.0)
             tensor<int32, [4]> var_3697_pad_0 = const()[name = tensor<string, []>("op_3697_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3697_dilations_0 = const()[name = tensor<string, []>("op_3697_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3697_groups_0 = const()[name = tensor<string, []>("op_3697_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(160749952))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(161274304))), name = tensor<string, []>("layers_9_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220787520))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(221574016))), name = tensor<string, []>("layers_9_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3697_cast_fp16 = conv(dilations = var_3697_dilations_0, groups = var_3697_groups_0, pad = var_3697_pad_0, pad_type = var_3697_pad_type_0, strides = var_3697_strides_0, weight = layers_9_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_39_cast_fp16)[name = tensor<string, []>("op_3697_cast_fp16")];
             tensor<string, []> var_3703_pad_type_0 = const()[name = tensor<string, []>("op_3703_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3703_strides_0 = const()[name = tensor<string, []>("op_3703_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3703_pad_0 = const()[name = tensor<string, []>("op_3703_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3703_dilations_0 = const()[name = tensor<string, []>("op_3703_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3703_groups_0 = const()[name = tensor<string, []>("op_3703_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(161332736))), name = tensor<string, []>("layers_9_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [29108]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(161274432))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(221632512))), name = tensor<string, []>("layers_9_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [29108]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(221574208))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3703_cast_fp16 = conv(dilations = var_3703_dilations_0, groups = var_3703_groups_0, pad = var_3703_pad_0, pad_type = var_3703_pad_type_0, strides = var_3703_strides_0, weight = layers_9_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_39_cast_fp16)[name = tensor<string, []>("op_3703_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_19_cast_fp16 = add(x = var_3697_cast_fp16, y = var_3703_cast_fp16)[name = tensor<string, []>("key_19_cast_fp16")];
             tensor<string, []> var_3713_pad_type_0 = const()[name = tensor<string, []>("op_3713_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2603,33 +2603,33 @@ program(1.0)
             tensor<int32, [4]> var_3713_pad_0 = const()[name = tensor<string, []>("op_3713_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3713_dilations_0 = const()[name = tensor<string, []>("op_3713_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3713_groups_0 = const()[name = tensor<string, []>("op_3713_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(161463872))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(161988224))), name = tensor<string, []>("layers_9_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(221763648))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(222550144))), name = tensor<string, []>("layers_9_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3713_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3713_dilations_0, groups = var_3713_groups_0, pad = var_3713_pad_0, pad_type = var_3713_pad_type_0, strides = var_3713_strides_0, weight = layers_9_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_39_cast_fp16)[name = tensor<string, []>("op_3713_cast_fp16")];
             tensor<string, []> var_3719_pad_type_0 = const()[name = tensor<string, []>("op_3719_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3719_strides_0 = const()[name = tensor<string, []>("op_3719_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3719_pad_0 = const()[name = tensor<string, []>("op_3719_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3719_dilations_0 = const()[name = tensor<string, []>("op_3719_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3719_groups_0 = const()[name = tensor<string, []>("op_3719_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(162019584))), name = tensor<string, []>("layers_9_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15563]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(161988352))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(222581568))), name = tensor<string, []>("layers_9_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15563]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(222550336))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3719_cast_fp16 = conv(dilations = var_3719_dilations_0, groups = var_3719_groups_0, pad = var_3719_pad_0, pad_type = var_3719_pad_type_0, strides = var_3719_strides_0, weight = layers_9_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_39_cast_fp16)[name = tensor<string, []>("op_3719_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_19_cast_fp16 = add(x = var_3713_cast_fp16, y = var_3719_cast_fp16)[name = tensor<string, []>("value_19_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_3722_to_fp16 = const()[name = tensor<string, []>("op_3722_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(162150720)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_3722_to_fp16 = const()[name = tensor<string, []>("op_3722_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(222712704)))];
             tensor<fp16, [1, 1024, 1, 188]> query_39_cast_fp16 = add(x = query_37_cast_fp16, y = var_3722_to_fp16)[name = tensor<string, []>("query_39_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_3725_to_fp16 = const()[name = tensor<string, []>("op_3725_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(162152832)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_3725_to_fp16 = const()[name = tensor<string, []>("op_3725_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(222714816)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_19_cast_fp16 = add(x = query_37_cast_fp16, y = var_3725_to_fp16)[name = tensor<string, []>("q_with_bias_v_19_cast_fp16")];
             tensor<string, []> var_3735_pad_type_0 = const()[name = tensor<string, []>("op_3735_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3735_strides_0 = const()[name = tensor<string, []>("op_3735_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3735_pad_0 = const()[name = tensor<string, []>("op_3735_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3735_dilations_0 = const()[name = tensor<string, []>("op_3735_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3735_groups_0 = const()[name = tensor<string, []>("op_3735_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(162154944))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(162679296))), name = tensor<string, []>("layers_9_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(222716928))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(223503424))), name = tensor<string, []>("layers_9_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_3735_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3735_dilations_0, groups = var_3735_groups_0, pad = var_3735_pad_0, pad_type = var_3735_pad_type_0, strides = var_3735_strides_0, weight = layers_9_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_3735_cast_fp16")];
             tensor<string, []> var_3741_pad_type_0 = const()[name = tensor<string, []>("op_3741_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3741_strides_0 = const()[name = tensor<string, []>("op_3741_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3741_pad_0 = const()[name = tensor<string, []>("op_3741_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3741_dilations_0 = const()[name = tensor<string, []>("op_3741_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3741_groups_0 = const()[name = tensor<string, []>("op_3741_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(162742848))), name = tensor<string, []>("layers_9_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31665]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(162679424))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(223567040))), name = tensor<string, []>("layers_9_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31665]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(223503616))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_3741_cast_fp16 = conv(dilations = var_3741_dilations_0, groups = var_3741_groups_0, pad = var_3741_pad_0, pad_type = var_3741_pad_type_0, strides = var_3741_strides_0, weight = layers_9_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_3741_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_19_cast_fp16 = add(x = var_3735_cast_fp16, y = var_3741_cast_fp16)[name = tensor<string, []>("p_19_cast_fp16")];
             tensor<int32, [4]> var_3745 = const()[name = tensor<string, []>("op_3745"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -2680,22 +2680,22 @@ program(1.0)
             tensor<int32, [4]> var_3798_pad_0 = const()[name = tensor<string, []>("op_3798_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3798_dilations_0 = const()[name = tensor<string, []>("op_3798_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3798_groups_0 = const()[name = tensor<string, []>("op_3798_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(162873984))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(163398336))), name = tensor<string, []>("layers_9_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(223698176))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(224484672))), name = tensor<string, []>("layers_9_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3798_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3798_dilations_0, groups = var_3798_groups_0, pad = var_3798_pad_0, pad_type = var_3798_pad_type_0, strides = var_3798_strides_0, weight = layers_9_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_257_cast_fp16)[name = tensor<string, []>("op_3798_cast_fp16")];
             tensor<string, []> var_3804_pad_type_0 = const()[name = tensor<string, []>("op_3804_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3804_strides_0 = const()[name = tensor<string, []>("op_3804_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3804_pad_0 = const()[name = tensor<string, []>("op_3804_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3804_dilations_0 = const()[name = tensor<string, []>("op_3804_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3804_groups_0 = const()[name = tensor<string, []>("op_3804_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(163428608))), name = tensor<string, []>("layers_9_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15012]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(163398464))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(224515008))), name = tensor<string, []>("layers_9_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15012]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(224484864))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3804_cast_fp16 = conv(dilations = var_3804_dilations_0, groups = var_3804_groups_0, pad = var_3804_pad_0, pad_type = var_3804_pad_type_0, strides = var_3804_strides_0, weight = layers_9_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_257_cast_fp16)[name = tensor<string, []>("op_3804_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_41_cast_fp16 = add(x = var_3798_cast_fp16, y = var_3804_cast_fp16)[name = tensor<string, []>("obj_41_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_95_cast_fp16 = add(x = inputs_93_cast_fp16, y = obj_41_cast_fp16)[name = tensor<string, []>("inputs_95_cast_fp16")];
             tensor<int32, [1]> out_95_axes_0 = const()[name = tensor<string, []>("out_95_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3815_to_fp16 = const()[name = tensor<string, []>("op_3815_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_95_cast_fp16 = layer_norm(axes = out_95_axes_0, epsilon = var_3815_to_fp16, x = inputs_95_cast_fp16)[name = tensor<string, []>("out_95_cast_fp16")];
-            tensor<fp16, [1024]> input_259_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_259_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(163559744)))];
-            tensor<fp16, [1024]> input_259_beta_0_to_fp16 = const()[name = tensor<string, []>("input_259_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(163561856)))];
+            tensor<fp16, [1024]> input_259_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_259_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(224646144)))];
+            tensor<fp16, [1024]> input_259_beta_0_to_fp16 = const()[name = tensor<string, []>("input_259_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(224648256)))];
             tensor<fp16, []> input_259_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_259_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_259_cast_fp16 = batch_norm(beta = input_259_beta_0_to_fp16, epsilon = input_259_epsilon_0_to_fp16, gamma = input_259_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_95_cast_fp16)[name = tensor<string, []>("input_259_cast_fp16")];
             tensor<string, []> var_3836_pad_type_0 = const()[name = tensor<string, []>("op_3836_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2703,14 +2703,14 @@ program(1.0)
             tensor<int32, [4]> var_3836_pad_0 = const()[name = tensor<string, []>("op_3836_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3836_dilations_0 = const()[name = tensor<string, []>("op_3836_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3836_groups_0 = const()[name = tensor<string, []>("op_3836_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_9_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(163563968))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(164612608))), name = tensor<string, []>("layers_9_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_9_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(224650368))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226223296))), name = tensor<string, []>("layers_9_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_3836_cast_fp16 = conv(dilations = var_3836_dilations_0, groups = var_3836_groups_0, pad = var_3836_pad_0, pad_type = var_3836_pad_type_0, strides = var_3836_strides_0, weight = layers_9_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_259_cast_fp16)[name = tensor<string, []>("op_3836_cast_fp16")];
             tensor<string, []> var_3842_pad_type_0 = const()[name = tensor<string, []>("op_3842_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3842_strides_0 = const()[name = tensor<string, []>("op_3842_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3842_pad_0 = const()[name = tensor<string, []>("op_3842_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3842_dilations_0 = const()[name = tensor<string, []>("op_3842_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3842_groups_0 = const()[name = tensor<string, []>("op_3842_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_9_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(164672576))), name = tensor<string, []>("layers_9_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [29883]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(164612736))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_9_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226283328))), name = tensor<string, []>("layers_9_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [29883]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226223488))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_3842_cast_fp16 = conv(dilations = var_3842_dilations_0, groups = var_3842_groups_0, pad = var_3842_pad_0, pad_type = var_3842_pad_type_0, strides = var_3842_strides_0, weight = layers_9_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_259_cast_fp16)[name = tensor<string, []>("op_3842_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_261_cast_fp16 = add(x = var_3836_cast_fp16, y = var_3842_cast_fp16)[name = tensor<string, []>("input_261_cast_fp16")];
             tensor<int32, []> input_263_split_num_splits_0 = const()[name = tensor<string, []>("input_263_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -2723,8 +2723,8 @@ program(1.0)
             tensor<int32, []> input_265_groups_0 = const()[name = tensor<string, []>("input_265_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_265_strides_0 = const()[name = tensor<string, []>("input_265_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_265_dilations_0 = const()[name = tensor<string, []>("input_265_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_286_to_fp16 = const()[name = tensor<string, []>("const_286_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(164934784)))];
-            tensor<fp16, [1024]> const_287_to_fp16 = const()[name = tensor<string, []>("const_287_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(164953280)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_286_to_fp16 = const()[name = tensor<string, []>("const_286_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226545536)))];
+            tensor<fp16, [1024]> const_287_to_fp16 = const()[name = tensor<string, []>("const_287_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226564032)))];
             tensor<fp16, [1, 1024, 1, 188]> input_267_cast_fp16 = conv(bias = const_287_to_fp16, dilations = input_265_dilations_0, groups = input_265_groups_0, pad = input_265_pad_0, pad_type = input_265_pad_type_0, strides = input_265_strides_0, weight = const_286_to_fp16, x = input_263_cast_fp16)[name = tensor<string, []>("input_267_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_269_cast_fp16 = silu(x = input_267_cast_fp16)[name = tensor<string, []>("input_269_cast_fp16")];
             tensor<string, []> var_3864_pad_type_0 = const()[name = tensor<string, []>("op_3864_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2732,22 +2732,22 @@ program(1.0)
             tensor<int32, [4]> var_3864_pad_0 = const()[name = tensor<string, []>("op_3864_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3864_dilations_0 = const()[name = tensor<string, []>("op_3864_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3864_groups_0 = const()[name = tensor<string, []>("op_3864_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(164955392))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(165479744))), name = tensor<string, []>("layers_9_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226566144))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227352640))), name = tensor<string, []>("layers_9_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3864_cast_fp16 = conv(dilations = var_3864_dilations_0, groups = var_3864_groups_0, pad = var_3864_pad_0, pad_type = var_3864_pad_type_0, strides = var_3864_strides_0, weight = layers_9_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_269_cast_fp16)[name = tensor<string, []>("op_3864_cast_fp16")];
             tensor<string, []> var_3870_pad_type_0 = const()[name = tensor<string, []>("op_3870_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3870_strides_0 = const()[name = tensor<string, []>("op_3870_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3870_pad_0 = const()[name = tensor<string, []>("op_3870_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3870_dilations_0 = const()[name = tensor<string, []>("op_3870_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3870_groups_0 = const()[name = tensor<string, []>("op_3870_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_9_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(165511232))), name = tensor<string, []>("layers_9_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15645]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(165479872))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_9_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227384192))), name = tensor<string, []>("layers_9_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15645]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227352832))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3870_cast_fp16 = conv(dilations = var_3870_dilations_0, groups = var_3870_groups_0, pad = var_3870_pad_0, pad_type = var_3870_pad_type_0, strides = var_3870_strides_0, weight = layers_9_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_269_cast_fp16)[name = tensor<string, []>("op_3870_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_59_cast_fp16 = add(x = var_3864_cast_fp16, y = var_3870_cast_fp16)[name = tensor<string, []>("x_59_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_97_cast_fp16 = add(x = inputs_95_cast_fp16, y = x_59_cast_fp16)[name = tensor<string, []>("inputs_97_cast_fp16")];
             tensor<int32, [1]> out_97_axes_0 = const()[name = tensor<string, []>("out_97_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3881_to_fp16 = const()[name = tensor<string, []>("op_3881_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_97_cast_fp16 = layer_norm(axes = out_97_axes_0, epsilon = var_3881_to_fp16, x = inputs_97_cast_fp16)[name = tensor<string, []>("out_97_cast_fp16")];
-            tensor<fp16, [1024]> input_271_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_271_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(165642368)))];
-            tensor<fp16, [1024]> input_271_beta_0_to_fp16 = const()[name = tensor<string, []>("input_271_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(165644480)))];
+            tensor<fp16, [1024]> input_271_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_271_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227515328)))];
+            tensor<fp16, [1024]> input_271_beta_0_to_fp16 = const()[name = tensor<string, []>("input_271_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227517440)))];
             tensor<fp16, []> input_271_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_271_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_271_cast_fp16 = batch_norm(beta = input_271_beta_0_to_fp16, epsilon = input_271_epsilon_0_to_fp16, gamma = input_271_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_97_cast_fp16)[name = tensor<string, []>("input_271_cast_fp16")];
             tensor<string, []> var_3901_pad_type_0 = const()[name = tensor<string, []>("op_3901_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2755,14 +2755,14 @@ program(1.0)
             tensor<int32, [4]> var_3901_pad_0 = const()[name = tensor<string, []>("op_3901_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3901_dilations_0 = const()[name = tensor<string, []>("op_3901_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3901_groups_0 = const()[name = tensor<string, []>("op_3901_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_9_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(165646592))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(167743808))), name = tensor<string, []>("layers_9_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_9_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227519552))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(230665344))), name = tensor<string, []>("layers_9_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3901_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_3901_dilations_0, groups = var_3901_groups_0, pad = var_3901_pad_0, pad_type = var_3901_pad_type_0, strides = var_3901_strides_0, weight = layers_9_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_271_cast_fp16)[name = tensor<string, []>("op_3901_cast_fp16")];
             tensor<string, []> var_3907_pad_type_0 = const()[name = tensor<string, []>("op_3907_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3907_strides_0 = const()[name = tensor<string, []>("op_3907_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3907_pad_0 = const()[name = tensor<string, []>("op_3907_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3907_dilations_0 = const()[name = tensor<string, []>("op_3907_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3907_groups_0 = const()[name = tensor<string, []>("op_3907_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_9_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(167895424))), name = tensor<string, []>("layers_9_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75682]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(167743936))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_9_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(230817024))), name = tensor<string, []>("layers_9_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75682]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(230665536))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_3907_cast_fp16 = conv(dilations = var_3907_dilations_0, groups = var_3907_groups_0, pad = var_3907_pad_0, pad_type = var_3907_pad_type_0, strides = var_3907_strides_0, weight = layers_9_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_271_cast_fp16)[name = tensor<string, []>("op_3907_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_273_cast_fp16 = add(x = var_3901_cast_fp16, y = var_3907_cast_fp16)[name = tensor<string, []>("input_273_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_275_cast_fp16 = silu(x = input_273_cast_fp16)[name = tensor<string, []>("input_275_cast_fp16")];
@@ -2771,14 +2771,14 @@ program(1.0)
             tensor<int32, [4]> var_3918_pad_0 = const()[name = tensor<string, []>("op_3918_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3918_dilations_0 = const()[name = tensor<string, []>("op_3918_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3918_groups_0 = const()[name = tensor<string, []>("op_3918_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_9_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(168419776))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(170516992))), name = tensor<string, []>("layers_9_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_9_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(231341376))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(234487168))), name = tensor<string, []>("layers_9_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3918_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_3918_dilations_0, groups = var_3918_groups_0, pad = var_3918_pad_0, pad_type = var_3918_pad_type_0, strides = var_3918_strides_0, weight = layers_9_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_275_cast_fp16)[name = tensor<string, []>("op_3918_cast_fp16")];
             tensor<string, []> var_3924_pad_type_0 = const()[name = tensor<string, []>("op_3924_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_3924_strides_0 = const()[name = tensor<string, []>("op_3924_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_3924_pad_0 = const()[name = tensor<string, []>("op_3924_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_3924_dilations_0 = const()[name = tensor<string, []>("op_3924_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_3924_groups_0 = const()[name = tensor<string, []>("op_3924_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_9_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(170654848))), name = tensor<string, []>("layers_9_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [68811]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(170517120))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_9_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(234625088))), name = tensor<string, []>("layers_9_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [68811]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(234487360))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_3924_cast_fp16 = conv(dilations = var_3924_dilations_0, groups = var_3924_groups_0, pad = var_3924_pad_0, pad_type = var_3924_pad_type_0, strides = var_3924_strides_0, weight = layers_9_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_275_cast_fp16)[name = tensor<string, []>("op_3924_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_61_cast_fp16 = add(x = var_3918_cast_fp16, y = var_3924_cast_fp16)[name = tensor<string, []>("x_61_cast_fp16")];
             tensor<fp16, []> var_3926_to_fp16 = const()[name = tensor<string, []>("op_3926_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -2787,16 +2787,16 @@ program(1.0)
             tensor<int32, [1]> out_99_axes_0 = const()[name = tensor<string, []>("out_99_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3937_to_fp16 = const()[name = tensor<string, []>("op_3937_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_99_cast_fp16 = layer_norm(axes = out_99_axes_0, epsilon = var_3937_to_fp16, x = inputs_99_cast_fp16)[name = tensor<string, []>("out_99_cast_fp16")];
-            tensor<fp16, [1024]> inputs_101_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_101_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(171179200)))];
-            tensor<fp16, [1024]> inputs_101_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_101_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(171181312)))];
+            tensor<fp16, [1024]> inputs_101_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_101_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(235149440)))];
+            tensor<fp16, [1024]> inputs_101_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_101_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(235151552)))];
             tensor<fp16, []> inputs_101_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_101_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_101_cast_fp16 = batch_norm(beta = inputs_101_beta_0_to_fp16, epsilon = inputs_101_epsilon_0_to_fp16, gamma = inputs_101_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_99_cast_fp16)[name = tensor<string, []>("inputs_101_cast_fp16")];
             tensor<int32, []> var_3951 = const()[name = tensor<string, []>("op_3951"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_101_axes_0 = const()[name = tensor<string, []>("out_101_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_3982_to_fp16 = const()[name = tensor<string, []>("op_3982_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_101_cast_fp16 = layer_norm(axes = out_101_axes_0, epsilon = var_3982_to_fp16, x = inputs_101_cast_fp16)[name = tensor<string, []>("out_101_cast_fp16")];
-            tensor<fp16, [1024]> input_277_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_277_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(171183424)))];
-            tensor<fp16, [1024]> input_277_beta_0_to_fp16 = const()[name = tensor<string, []>("input_277_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(171185536)))];
+            tensor<fp16, [1024]> input_277_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_277_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(235153664)))];
+            tensor<fp16, [1024]> input_277_beta_0_to_fp16 = const()[name = tensor<string, []>("input_277_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(235155776)))];
             tensor<fp16, []> input_277_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_277_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_277_cast_fp16 = batch_norm(beta = input_277_beta_0_to_fp16, epsilon = input_277_epsilon_0_to_fp16, gamma = input_277_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_101_cast_fp16)[name = tensor<string, []>("input_277_cast_fp16")];
             tensor<string, []> var_4002_pad_type_0 = const()[name = tensor<string, []>("op_4002_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2804,14 +2804,14 @@ program(1.0)
             tensor<int32, [4]> var_4002_pad_0 = const()[name = tensor<string, []>("op_4002_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4002_dilations_0 = const()[name = tensor<string, []>("op_4002_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4002_groups_0 = const()[name = tensor<string, []>("op_4002_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_10_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(171187648))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173284864))), name = tensor<string, []>("layers_10_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_10_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(235157888))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(238303680))), name = tensor<string, []>("layers_10_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4002_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_4002_dilations_0, groups = var_4002_groups_0, pad = var_4002_pad_0, pad_type = var_4002_pad_type_0, strides = var_4002_strides_0, weight = layers_10_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_277_cast_fp16)[name = tensor<string, []>("op_4002_cast_fp16")];
             tensor<string, []> var_4008_pad_type_0 = const()[name = tensor<string, []>("op_4008_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4008_strides_0 = const()[name = tensor<string, []>("op_4008_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4008_pad_0 = const()[name = tensor<string, []>("op_4008_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4008_dilations_0 = const()[name = tensor<string, []>("op_4008_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4008_groups_0 = const()[name = tensor<string, []>("op_4008_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_10_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173457536))), name = tensor<string, []>("layers_10_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [86233]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173284992))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_10_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(238476416))), name = tensor<string, []>("layers_10_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [86233]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(238303872))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4008_cast_fp16 = conv(dilations = var_4008_dilations_0, groups = var_4008_groups_0, pad = var_4008_pad_0, pad_type = var_4008_pad_type_0, strides = var_4008_strides_0, weight = layers_10_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_277_cast_fp16)[name = tensor<string, []>("op_4008_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_279_cast_fp16 = add(x = var_4002_cast_fp16, y = var_4008_cast_fp16)[name = tensor<string, []>("input_279_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_281_cast_fp16 = silu(x = input_279_cast_fp16)[name = tensor<string, []>("input_281_cast_fp16")];
@@ -2820,14 +2820,14 @@ program(1.0)
             tensor<int32, [4]> var_4019_pad_0 = const()[name = tensor<string, []>("op_4019_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4019_dilations_0 = const()[name = tensor<string, []>("op_4019_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4019_groups_0 = const()[name = tensor<string, []>("op_4019_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_10_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(173981888))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176079104))), name = tensor<string, []>("layers_10_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_10_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(239000768))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(242146560))), name = tensor<string, []>("layers_10_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4019_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4019_dilations_0, groups = var_4019_groups_0, pad = var_4019_pad_0, pad_type = var_4019_pad_type_0, strides = var_4019_strides_0, weight = layers_10_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_281_cast_fp16)[name = tensor<string, []>("op_4019_cast_fp16")];
             tensor<string, []> var_4025_pad_type_0 = const()[name = tensor<string, []>("op_4025_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4025_strides_0 = const()[name = tensor<string, []>("op_4025_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4025_pad_0 = const()[name = tensor<string, []>("op_4025_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4025_dilations_0 = const()[name = tensor<string, []>("op_4025_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4025_groups_0 = const()[name = tensor<string, []>("op_4025_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_10_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176232512))), name = tensor<string, []>("layers_10_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [76584]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176079232))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_10_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(242300032))), name = tensor<string, []>("layers_10_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [76584]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(242146752))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4025_cast_fp16 = conv(dilations = var_4025_dilations_0, groups = var_4025_groups_0, pad = var_4025_pad_0, pad_type = var_4025_pad_type_0, strides = var_4025_strides_0, weight = layers_10_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_281_cast_fp16)[name = tensor<string, []>("op_4025_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_63_cast_fp16 = add(x = var_4019_cast_fp16, y = var_4025_cast_fp16)[name = tensor<string, []>("x_63_cast_fp16")];
             tensor<fp16, []> var_4027_to_fp16 = const()[name = tensor<string, []>("op_4027_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -2836,8 +2836,8 @@ program(1.0)
             tensor<int32, [1]> out_103_axes_0 = const()[name = tensor<string, []>("out_103_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4038_to_fp16 = const()[name = tensor<string, []>("op_4038_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_103_cast_fp16 = layer_norm(axes = out_103_axes_0, epsilon = var_4038_to_fp16, x = inputs_103_cast_fp16)[name = tensor<string, []>("out_103_cast_fp16")];
-            tensor<fp16, [1024]> obj_43_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_43_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176756864)))];
-            tensor<fp16, [1024]> obj_43_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_43_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176758976)))];
+            tensor<fp16, [1024]> obj_43_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_43_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(242824384)))];
+            tensor<fp16, [1024]> obj_43_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_43_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(242826496)))];
             tensor<fp16, []> obj_43_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_43_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_43_cast_fp16 = batch_norm(beta = obj_43_beta_0_to_fp16, epsilon = obj_43_epsilon_0_to_fp16, gamma = obj_43_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_103_cast_fp16)[name = tensor<string, []>("obj_43_cast_fp16")];
             tensor<string, []> var_4063_pad_type_0 = const()[name = tensor<string, []>("op_4063_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2845,14 +2845,14 @@ program(1.0)
             tensor<int32, [4]> var_4063_pad_0 = const()[name = tensor<string, []>("op_4063_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4063_dilations_0 = const()[name = tensor<string, []>("op_4063_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4063_groups_0 = const()[name = tensor<string, []>("op_4063_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(176761088))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177285440))), name = tensor<string, []>("layers_10_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(242828608))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(243615104))), name = tensor<string, []>("layers_10_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4063_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4063_dilations_0, groups = var_4063_groups_0, pad = var_4063_pad_0, pad_type = var_4063_pad_type_0, strides = var_4063_strides_0, weight = layers_10_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_43_cast_fp16)[name = tensor<string, []>("op_4063_cast_fp16")];
             tensor<string, []> var_4069_pad_type_0 = const()[name = tensor<string, []>("op_4069_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4069_strides_0 = const()[name = tensor<string, []>("op_4069_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4069_pad_0 = const()[name = tensor<string, []>("op_4069_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4069_dilations_0 = const()[name = tensor<string, []>("op_4069_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4069_groups_0 = const()[name = tensor<string, []>("op_4069_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177321600))), name = tensor<string, []>("layers_10_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17959]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177285568))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(243651328))), name = tensor<string, []>("layers_10_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17959]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(243615296))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4069_cast_fp16 = conv(dilations = var_4069_dilations_0, groups = var_4069_groups_0, pad = var_4069_pad_0, pad_type = var_4069_pad_type_0, strides = var_4069_strides_0, weight = layers_10_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_43_cast_fp16)[name = tensor<string, []>("op_4069_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_41_cast_fp16 = add(x = var_4063_cast_fp16, y = var_4069_cast_fp16)[name = tensor<string, []>("query_41_cast_fp16")];
             tensor<string, []> var_4078_pad_type_0 = const()[name = tensor<string, []>("op_4078_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2860,14 +2860,14 @@ program(1.0)
             tensor<int32, [4]> var_4078_pad_0 = const()[name = tensor<string, []>("op_4078_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4078_dilations_0 = const()[name = tensor<string, []>("op_4078_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4078_groups_0 = const()[name = tensor<string, []>("op_4078_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177452736))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177977088))), name = tensor<string, []>("layers_10_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(243782464))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244568960))), name = tensor<string, []>("layers_10_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4078_cast_fp16 = conv(dilations = var_4078_dilations_0, groups = var_4078_groups_0, pad = var_4078_pad_0, pad_type = var_4078_pad_type_0, strides = var_4078_strides_0, weight = layers_10_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_43_cast_fp16)[name = tensor<string, []>("op_4078_cast_fp16")];
             tensor<string, []> var_4084_pad_type_0 = const()[name = tensor<string, []>("op_4084_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4084_strides_0 = const()[name = tensor<string, []>("op_4084_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4084_pad_0 = const()[name = tensor<string, []>("op_4084_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4084_dilations_0 = const()[name = tensor<string, []>("op_4084_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4084_groups_0 = const()[name = tensor<string, []>("op_4084_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178020160))), name = tensor<string, []>("layers_10_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [21436]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(177977216))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244612096))), name = tensor<string, []>("layers_10_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [21436]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244569152))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4084_cast_fp16 = conv(dilations = var_4084_dilations_0, groups = var_4084_groups_0, pad = var_4084_pad_0, pad_type = var_4084_pad_type_0, strides = var_4084_strides_0, weight = layers_10_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_43_cast_fp16)[name = tensor<string, []>("op_4084_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_21_cast_fp16 = add(x = var_4078_cast_fp16, y = var_4084_cast_fp16)[name = tensor<string, []>("key_21_cast_fp16")];
             tensor<string, []> var_4094_pad_type_0 = const()[name = tensor<string, []>("op_4094_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2875,33 +2875,33 @@ program(1.0)
             tensor<int32, [4]> var_4094_pad_0 = const()[name = tensor<string, []>("op_4094_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4094_dilations_0 = const()[name = tensor<string, []>("op_4094_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4094_groups_0 = const()[name = tensor<string, []>("op_4094_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178151296))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178675648))), name = tensor<string, []>("layers_10_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244743232))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245529728))), name = tensor<string, []>("layers_10_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4094_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4094_dilations_0, groups = var_4094_groups_0, pad = var_4094_pad_0, pad_type = var_4094_pad_type_0, strides = var_4094_strides_0, weight = layers_10_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_43_cast_fp16)[name = tensor<string, []>("op_4094_cast_fp16")];
             tensor<string, []> var_4100_pad_type_0 = const()[name = tensor<string, []>("op_4100_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4100_strides_0 = const()[name = tensor<string, []>("op_4100_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4100_pad_0 = const()[name = tensor<string, []>("op_4100_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4100_dilations_0 = const()[name = tensor<string, []>("op_4100_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4100_groups_0 = const()[name = tensor<string, []>("op_4100_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178706944))), name = tensor<string, []>("layers_10_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15532]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178675776))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245561088))), name = tensor<string, []>("layers_10_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15532]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245529920))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4100_cast_fp16 = conv(dilations = var_4100_dilations_0, groups = var_4100_groups_0, pad = var_4100_pad_0, pad_type = var_4100_pad_type_0, strides = var_4100_strides_0, weight = layers_10_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_43_cast_fp16)[name = tensor<string, []>("op_4100_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_21_cast_fp16 = add(x = var_4094_cast_fp16, y = var_4100_cast_fp16)[name = tensor<string, []>("value_21_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_4103_to_fp16 = const()[name = tensor<string, []>("op_4103_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178838080)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_4103_to_fp16 = const()[name = tensor<string, []>("op_4103_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245692224)))];
             tensor<fp16, [1, 1024, 1, 188]> query_43_cast_fp16 = add(x = query_41_cast_fp16, y = var_4103_to_fp16)[name = tensor<string, []>("query_43_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_4106_to_fp16 = const()[name = tensor<string, []>("op_4106_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178840192)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_4106_to_fp16 = const()[name = tensor<string, []>("op_4106_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245694336)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_21_cast_fp16 = add(x = query_41_cast_fp16, y = var_4106_to_fp16)[name = tensor<string, []>("q_with_bias_v_21_cast_fp16")];
             tensor<string, []> var_4116_pad_type_0 = const()[name = tensor<string, []>("op_4116_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4116_strides_0 = const()[name = tensor<string, []>("op_4116_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4116_pad_0 = const()[name = tensor<string, []>("op_4116_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4116_dilations_0 = const()[name = tensor<string, []>("op_4116_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4116_groups_0 = const()[name = tensor<string, []>("op_4116_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(178842304))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(179366656))), name = tensor<string, []>("layers_10_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245696448))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246482944))), name = tensor<string, []>("layers_10_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_4116_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4116_dilations_0, groups = var_4116_groups_0, pad = var_4116_pad_0, pad_type = var_4116_pad_type_0, strides = var_4116_strides_0, weight = layers_10_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_4116_cast_fp16")];
             tensor<string, []> var_4122_pad_type_0 = const()[name = tensor<string, []>("op_4122_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4122_strides_0 = const()[name = tensor<string, []>("op_4122_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4122_pad_0 = const()[name = tensor<string, []>("op_4122_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4122_dilations_0 = const()[name = tensor<string, []>("op_4122_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4122_groups_0 = const()[name = tensor<string, []>("op_4122_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(179422784))), name = tensor<string, []>("layers_10_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [27964]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(179366784))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246539136))), name = tensor<string, []>("layers_10_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [27964]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246483136))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_4122_cast_fp16 = conv(dilations = var_4122_dilations_0, groups = var_4122_groups_0, pad = var_4122_pad_0, pad_type = var_4122_pad_type_0, strides = var_4122_strides_0, weight = layers_10_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_4122_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_21_cast_fp16 = add(x = var_4116_cast_fp16, y = var_4122_cast_fp16)[name = tensor<string, []>("p_21_cast_fp16")];
             tensor<int32, [4]> var_4126 = const()[name = tensor<string, []>("op_4126"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -2952,22 +2952,22 @@ program(1.0)
             tensor<int32, [4]> var_4179_pad_0 = const()[name = tensor<string, []>("op_4179_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4179_dilations_0 = const()[name = tensor<string, []>("op_4179_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4179_groups_0 = const()[name = tensor<string, []>("op_4179_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(179553920))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180078272))), name = tensor<string, []>("layers_10_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246670272))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247456768))), name = tensor<string, []>("layers_10_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4179_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4179_dilations_0, groups = var_4179_groups_0, pad = var_4179_pad_0, pad_type = var_4179_pad_type_0, strides = var_4179_strides_0, weight = layers_10_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_283_cast_fp16)[name = tensor<string, []>("op_4179_cast_fp16")];
             tensor<string, []> var_4185_pad_type_0 = const()[name = tensor<string, []>("op_4185_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4185_strides_0 = const()[name = tensor<string, []>("op_4185_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4185_pad_0 = const()[name = tensor<string, []>("op_4185_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4185_dilations_0 = const()[name = tensor<string, []>("op_4185_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4185_groups_0 = const()[name = tensor<string, []>("op_4185_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180110464))), name = tensor<string, []>("layers_10_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15971]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180078400))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247489024))), name = tensor<string, []>("layers_10_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15971]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247456960))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4185_cast_fp16 = conv(dilations = var_4185_dilations_0, groups = var_4185_groups_0, pad = var_4185_pad_0, pad_type = var_4185_pad_type_0, strides = var_4185_strides_0, weight = layers_10_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_283_cast_fp16)[name = tensor<string, []>("op_4185_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_45_cast_fp16 = add(x = var_4179_cast_fp16, y = var_4185_cast_fp16)[name = tensor<string, []>("obj_45_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_105_cast_fp16 = add(x = inputs_103_cast_fp16, y = obj_45_cast_fp16)[name = tensor<string, []>("inputs_105_cast_fp16")];
             tensor<int32, [1]> out_105_axes_0 = const()[name = tensor<string, []>("out_105_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4196_to_fp16 = const()[name = tensor<string, []>("op_4196_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_105_cast_fp16 = layer_norm(axes = out_105_axes_0, epsilon = var_4196_to_fp16, x = inputs_105_cast_fp16)[name = tensor<string, []>("out_105_cast_fp16")];
-            tensor<fp16, [1024]> input_285_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_285_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180241600)))];
-            tensor<fp16, [1024]> input_285_beta_0_to_fp16 = const()[name = tensor<string, []>("input_285_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180243712)))];
+            tensor<fp16, [1024]> input_285_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_285_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247620160)))];
+            tensor<fp16, [1024]> input_285_beta_0_to_fp16 = const()[name = tensor<string, []>("input_285_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247622272)))];
             tensor<fp16, []> input_285_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_285_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_285_cast_fp16 = batch_norm(beta = input_285_beta_0_to_fp16, epsilon = input_285_epsilon_0_to_fp16, gamma = input_285_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_105_cast_fp16)[name = tensor<string, []>("input_285_cast_fp16")];
             tensor<string, []> var_4217_pad_type_0 = const()[name = tensor<string, []>("op_4217_pad_type_0"), val = tensor<string, []>("valid")];
@@ -2975,14 +2975,14 @@ program(1.0)
             tensor<int32, [4]> var_4217_pad_0 = const()[name = tensor<string, []>("op_4217_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4217_dilations_0 = const()[name = tensor<string, []>("op_4217_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4217_groups_0 = const()[name = tensor<string, []>("op_4217_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_10_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(180245824))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181294464))), name = tensor<string, []>("layers_10_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_10_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247624384))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249197312))), name = tensor<string, []>("layers_10_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_4217_cast_fp16 = conv(dilations = var_4217_dilations_0, groups = var_4217_groups_0, pad = var_4217_pad_0, pad_type = var_4217_pad_type_0, strides = var_4217_strides_0, weight = layers_10_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_285_cast_fp16)[name = tensor<string, []>("op_4217_cast_fp16")];
             tensor<string, []> var_4223_pad_type_0 = const()[name = tensor<string, []>("op_4223_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4223_strides_0 = const()[name = tensor<string, []>("op_4223_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4223_pad_0 = const()[name = tensor<string, []>("op_4223_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4223_dilations_0 = const()[name = tensor<string, []>("op_4223_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4223_groups_0 = const()[name = tensor<string, []>("op_4223_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_10_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181354944))), name = tensor<string, []>("layers_10_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [30138]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181294592))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_10_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249257856))), name = tensor<string, []>("layers_10_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [30138]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249197504))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_4223_cast_fp16 = conv(dilations = var_4223_dilations_0, groups = var_4223_groups_0, pad = var_4223_pad_0, pad_type = var_4223_pad_type_0, strides = var_4223_strides_0, weight = layers_10_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_285_cast_fp16)[name = tensor<string, []>("op_4223_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_287_cast_fp16 = add(x = var_4217_cast_fp16, y = var_4223_cast_fp16)[name = tensor<string, []>("input_287_cast_fp16")];
             tensor<int32, []> input_289_split_num_splits_0 = const()[name = tensor<string, []>("input_289_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -2995,8 +2995,8 @@ program(1.0)
             tensor<int32, []> input_291_groups_0 = const()[name = tensor<string, []>("input_291_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_291_strides_0 = const()[name = tensor<string, []>("input_291_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_291_dilations_0 = const()[name = tensor<string, []>("input_291_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_288_to_fp16 = const()[name = tensor<string, []>("const_288_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181617152)))];
-            tensor<fp16, [1024]> const_289_to_fp16 = const()[name = tensor<string, []>("const_289_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181635648)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_288_to_fp16 = const()[name = tensor<string, []>("const_288_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249520064)))];
+            tensor<fp16, [1024]> const_289_to_fp16 = const()[name = tensor<string, []>("const_289_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249538560)))];
             tensor<fp16, [1, 1024, 1, 188]> input_293_cast_fp16 = conv(bias = const_289_to_fp16, dilations = input_291_dilations_0, groups = input_291_groups_0, pad = input_291_pad_0, pad_type = input_291_pad_type_0, strides = input_291_strides_0, weight = const_288_to_fp16, x = input_289_cast_fp16)[name = tensor<string, []>("input_293_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_295_cast_fp16 = silu(x = input_293_cast_fp16)[name = tensor<string, []>("input_295_cast_fp16")];
             tensor<string, []> var_4245_pad_type_0 = const()[name = tensor<string, []>("op_4245_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3004,22 +3004,22 @@ program(1.0)
             tensor<int32, [4]> var_4245_pad_0 = const()[name = tensor<string, []>("op_4245_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4245_dilations_0 = const()[name = tensor<string, []>("op_4245_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4245_groups_0 = const()[name = tensor<string, []>("op_4245_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(181637760))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(182162112))), name = tensor<string, []>("layers_10_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249540672))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(250327168))), name = tensor<string, []>("layers_10_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4245_cast_fp16 = conv(dilations = var_4245_dilations_0, groups = var_4245_groups_0, pad = var_4245_pad_0, pad_type = var_4245_pad_type_0, strides = var_4245_strides_0, weight = layers_10_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_295_cast_fp16)[name = tensor<string, []>("op_4245_cast_fp16")];
             tensor<string, []> var_4251_pad_type_0 = const()[name = tensor<string, []>("op_4251_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4251_strides_0 = const()[name = tensor<string, []>("op_4251_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4251_pad_0 = const()[name = tensor<string, []>("op_4251_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4251_dilations_0 = const()[name = tensor<string, []>("op_4251_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4251_groups_0 = const()[name = tensor<string, []>("op_4251_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_10_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(182193600))), name = tensor<string, []>("layers_10_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15623]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(182162240))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_10_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(250358720))), name = tensor<string, []>("layers_10_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15623]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(250327360))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4251_cast_fp16 = conv(dilations = var_4251_dilations_0, groups = var_4251_groups_0, pad = var_4251_pad_0, pad_type = var_4251_pad_type_0, strides = var_4251_strides_0, weight = layers_10_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_295_cast_fp16)[name = tensor<string, []>("op_4251_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_65_cast_fp16 = add(x = var_4245_cast_fp16, y = var_4251_cast_fp16)[name = tensor<string, []>("x_65_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_107_cast_fp16 = add(x = inputs_105_cast_fp16, y = x_65_cast_fp16)[name = tensor<string, []>("inputs_107_cast_fp16")];
             tensor<int32, [1]> out_107_axes_0 = const()[name = tensor<string, []>("out_107_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4262_to_fp16 = const()[name = tensor<string, []>("op_4262_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_107_cast_fp16 = layer_norm(axes = out_107_axes_0, epsilon = var_4262_to_fp16, x = inputs_107_cast_fp16)[name = tensor<string, []>("out_107_cast_fp16")];
-            tensor<fp16, [1024]> input_297_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_297_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(182324736)))];
-            tensor<fp16, [1024]> input_297_beta_0_to_fp16 = const()[name = tensor<string, []>("input_297_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(182326848)))];
+            tensor<fp16, [1024]> input_297_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_297_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(250489856)))];
+            tensor<fp16, [1024]> input_297_beta_0_to_fp16 = const()[name = tensor<string, []>("input_297_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(250491968)))];
             tensor<fp16, []> input_297_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_297_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_297_cast_fp16 = batch_norm(beta = input_297_beta_0_to_fp16, epsilon = input_297_epsilon_0_to_fp16, gamma = input_297_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_107_cast_fp16)[name = tensor<string, []>("input_297_cast_fp16")];
             tensor<string, []> var_4282_pad_type_0 = const()[name = tensor<string, []>("op_4282_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3027,14 +3027,14 @@ program(1.0)
             tensor<int32, [4]> var_4282_pad_0 = const()[name = tensor<string, []>("op_4282_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4282_dilations_0 = const()[name = tensor<string, []>("op_4282_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4282_groups_0 = const()[name = tensor<string, []>("op_4282_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_10_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(182328960))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(184426176))), name = tensor<string, []>("layers_10_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_10_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(250494080))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(253639872))), name = tensor<string, []>("layers_10_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4282_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_4282_dilations_0, groups = var_4282_groups_0, pad = var_4282_pad_0, pad_type = var_4282_pad_type_0, strides = var_4282_strides_0, weight = layers_10_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_297_cast_fp16)[name = tensor<string, []>("op_4282_cast_fp16")];
             tensor<string, []> var_4288_pad_type_0 = const()[name = tensor<string, []>("op_4288_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4288_strides_0 = const()[name = tensor<string, []>("op_4288_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4288_pad_0 = const()[name = tensor<string, []>("op_4288_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4288_dilations_0 = const()[name = tensor<string, []>("op_4288_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4288_groups_0 = const()[name = tensor<string, []>("op_4288_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_10_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(184582976))), name = tensor<string, []>("layers_10_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [78299]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(184426304))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_10_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(253796736))), name = tensor<string, []>("layers_10_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [78299]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(253640064))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4288_cast_fp16 = conv(dilations = var_4288_dilations_0, groups = var_4288_groups_0, pad = var_4288_pad_0, pad_type = var_4288_pad_type_0, strides = var_4288_strides_0, weight = layers_10_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_297_cast_fp16)[name = tensor<string, []>("op_4288_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_299_cast_fp16 = add(x = var_4282_cast_fp16, y = var_4288_cast_fp16)[name = tensor<string, []>("input_299_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_301_cast_fp16 = silu(x = input_299_cast_fp16)[name = tensor<string, []>("input_301_cast_fp16")];
@@ -3043,14 +3043,14 @@ program(1.0)
             tensor<int32, [4]> var_4299_pad_0 = const()[name = tensor<string, []>("op_4299_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4299_dilations_0 = const()[name = tensor<string, []>("op_4299_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4299_groups_0 = const()[name = tensor<string, []>("op_4299_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_10_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(185107328))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(187204544))), name = tensor<string, []>("layers_10_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_10_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(254321088))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(257466880))), name = tensor<string, []>("layers_10_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4299_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4299_dilations_0, groups = var_4299_groups_0, pad = var_4299_pad_0, pad_type = var_4299_pad_type_0, strides = var_4299_strides_0, weight = layers_10_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_301_cast_fp16)[name = tensor<string, []>("op_4299_cast_fp16")];
             tensor<string, []> var_4305_pad_type_0 = const()[name = tensor<string, []>("op_4305_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4305_strides_0 = const()[name = tensor<string, []>("op_4305_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4305_pad_0 = const()[name = tensor<string, []>("op_4305_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4305_dilations_0 = const()[name = tensor<string, []>("op_4305_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4305_groups_0 = const()[name = tensor<string, []>("op_4305_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_10_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(187354624))), name = tensor<string, []>("layers_10_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [74933]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(187204672))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_10_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(257617024))), name = tensor<string, []>("layers_10_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [74933]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(257467072))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4305_cast_fp16 = conv(dilations = var_4305_dilations_0, groups = var_4305_groups_0, pad = var_4305_pad_0, pad_type = var_4305_pad_type_0, strides = var_4305_strides_0, weight = layers_10_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_301_cast_fp16)[name = tensor<string, []>("op_4305_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_67_cast_fp16 = add(x = var_4299_cast_fp16, y = var_4305_cast_fp16)[name = tensor<string, []>("x_67_cast_fp16")];
             tensor<fp16, []> var_4307_to_fp16 = const()[name = tensor<string, []>("op_4307_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -3059,16 +3059,16 @@ program(1.0)
             tensor<int32, [1]> out_109_axes_0 = const()[name = tensor<string, []>("out_109_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4318_to_fp16 = const()[name = tensor<string, []>("op_4318_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_109_cast_fp16 = layer_norm(axes = out_109_axes_0, epsilon = var_4318_to_fp16, x = inputs_109_cast_fp16)[name = tensor<string, []>("out_109_cast_fp16")];
-            tensor<fp16, [1024]> inputs_111_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_111_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(187878976)))];
-            tensor<fp16, [1024]> inputs_111_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_111_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(187881088)))];
+            tensor<fp16, [1024]> inputs_111_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_111_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(258141376)))];
+            tensor<fp16, [1024]> inputs_111_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_111_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(258143488)))];
             tensor<fp16, []> inputs_111_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_111_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_111_cast_fp16 = batch_norm(beta = inputs_111_beta_0_to_fp16, epsilon = inputs_111_epsilon_0_to_fp16, gamma = inputs_111_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_109_cast_fp16)[name = tensor<string, []>("inputs_111_cast_fp16")];
             tensor<int32, []> var_4332 = const()[name = tensor<string, []>("op_4332"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_111_axes_0 = const()[name = tensor<string, []>("out_111_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4363_to_fp16 = const()[name = tensor<string, []>("op_4363_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_111_cast_fp16 = layer_norm(axes = out_111_axes_0, epsilon = var_4363_to_fp16, x = inputs_111_cast_fp16)[name = tensor<string, []>("out_111_cast_fp16")];
-            tensor<fp16, [1024]> input_303_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_303_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(187883200)))];
-            tensor<fp16, [1024]> input_303_beta_0_to_fp16 = const()[name = tensor<string, []>("input_303_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(187885312)))];
+            tensor<fp16, [1024]> input_303_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_303_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(258145600)))];
+            tensor<fp16, [1024]> input_303_beta_0_to_fp16 = const()[name = tensor<string, []>("input_303_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(258147712)))];
             tensor<fp16, []> input_303_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_303_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_303_cast_fp16 = batch_norm(beta = input_303_beta_0_to_fp16, epsilon = input_303_epsilon_0_to_fp16, gamma = input_303_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_111_cast_fp16)[name = tensor<string, []>("input_303_cast_fp16")];
             tensor<string, []> var_4383_pad_type_0 = const()[name = tensor<string, []>("op_4383_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3076,14 +3076,14 @@ program(1.0)
             tensor<int32, [4]> var_4383_pad_0 = const()[name = tensor<string, []>("op_4383_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4383_dilations_0 = const()[name = tensor<string, []>("op_4383_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4383_groups_0 = const()[name = tensor<string, []>("op_4383_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_11_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(187887424))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(189984640))), name = tensor<string, []>("layers_11_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_11_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(258149824))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(261295616))), name = tensor<string, []>("layers_11_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4383_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_4383_dilations_0, groups = var_4383_groups_0, pad = var_4383_pad_0, pad_type = var_4383_pad_type_0, strides = var_4383_strides_0, weight = layers_11_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_303_cast_fp16)[name = tensor<string, []>("op_4383_cast_fp16")];
             tensor<string, []> var_4389_pad_type_0 = const()[name = tensor<string, []>("op_4389_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4389_strides_0 = const()[name = tensor<string, []>("op_4389_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4389_pad_0 = const()[name = tensor<string, []>("op_4389_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4389_dilations_0 = const()[name = tensor<string, []>("op_4389_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4389_groups_0 = const()[name = tensor<string, []>("op_4389_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_11_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(190153600))), name = tensor<string, []>("layers_11_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [84368]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(189984768))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_11_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(261464640))), name = tensor<string, []>("layers_11_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [84368]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(261295808))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4389_cast_fp16 = conv(dilations = var_4389_dilations_0, groups = var_4389_groups_0, pad = var_4389_pad_0, pad_type = var_4389_pad_type_0, strides = var_4389_strides_0, weight = layers_11_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_303_cast_fp16)[name = tensor<string, []>("op_4389_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_305_cast_fp16 = add(x = var_4383_cast_fp16, y = var_4389_cast_fp16)[name = tensor<string, []>("input_305_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_307_cast_fp16 = silu(x = input_305_cast_fp16)[name = tensor<string, []>("input_307_cast_fp16")];
@@ -3092,14 +3092,14 @@ program(1.0)
             tensor<int32, [4]> var_4400_pad_0 = const()[name = tensor<string, []>("op_4400_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4400_dilations_0 = const()[name = tensor<string, []>("op_4400_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4400_groups_0 = const()[name = tensor<string, []>("op_4400_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_11_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(190677952))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(192775168))), name = tensor<string, []>("layers_11_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_11_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(261988992))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265134784))), name = tensor<string, []>("layers_11_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4400_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4400_dilations_0, groups = var_4400_groups_0, pad = var_4400_pad_0, pad_type = var_4400_pad_type_0, strides = var_4400_strides_0, weight = layers_11_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_307_cast_fp16)[name = tensor<string, []>("op_4400_cast_fp16")];
             tensor<string, []> var_4406_pad_type_0 = const()[name = tensor<string, []>("op_4406_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4406_strides_0 = const()[name = tensor<string, []>("op_4406_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4406_pad_0 = const()[name = tensor<string, []>("op_4406_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4406_dilations_0 = const()[name = tensor<string, []>("op_4406_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4406_groups_0 = const()[name = tensor<string, []>("op_4406_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_11_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(192926656))), name = tensor<string, []>("layers_11_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75621]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(192775296))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_11_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265286336))), name = tensor<string, []>("layers_11_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75621]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265134976))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4406_cast_fp16 = conv(dilations = var_4406_dilations_0, groups = var_4406_groups_0, pad = var_4406_pad_0, pad_type = var_4406_pad_type_0, strides = var_4406_strides_0, weight = layers_11_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_307_cast_fp16)[name = tensor<string, []>("op_4406_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_69_cast_fp16 = add(x = var_4400_cast_fp16, y = var_4406_cast_fp16)[name = tensor<string, []>("x_69_cast_fp16")];
             tensor<fp16, []> var_4408_to_fp16 = const()[name = tensor<string, []>("op_4408_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -3108,8 +3108,8 @@ program(1.0)
             tensor<int32, [1]> out_113_axes_0 = const()[name = tensor<string, []>("out_113_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4419_to_fp16 = const()[name = tensor<string, []>("op_4419_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_113_cast_fp16 = layer_norm(axes = out_113_axes_0, epsilon = var_4419_to_fp16, x = inputs_113_cast_fp16)[name = tensor<string, []>("out_113_cast_fp16")];
-            tensor<fp16, [1024]> obj_47_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_47_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(193451008)))];
-            tensor<fp16, [1024]> obj_47_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_47_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(193453120)))];
+            tensor<fp16, [1024]> obj_47_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_47_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265810688)))];
+            tensor<fp16, [1024]> obj_47_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_47_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265812800)))];
             tensor<fp16, []> obj_47_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_47_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_47_cast_fp16 = batch_norm(beta = obj_47_beta_0_to_fp16, epsilon = obj_47_epsilon_0_to_fp16, gamma = obj_47_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_113_cast_fp16)[name = tensor<string, []>("obj_47_cast_fp16")];
             tensor<string, []> var_4444_pad_type_0 = const()[name = tensor<string, []>("op_4444_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3117,14 +3117,14 @@ program(1.0)
             tensor<int32, [4]> var_4444_pad_0 = const()[name = tensor<string, []>("op_4444_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4444_dilations_0 = const()[name = tensor<string, []>("op_4444_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4444_groups_0 = const()[name = tensor<string, []>("op_4444_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(193455232))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(193979584))), name = tensor<string, []>("layers_11_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265814912))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(266601408))), name = tensor<string, []>("layers_11_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4444_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4444_dilations_0, groups = var_4444_groups_0, pad = var_4444_pad_0, pad_type = var_4444_pad_type_0, strides = var_4444_strides_0, weight = layers_11_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_47_cast_fp16)[name = tensor<string, []>("op_4444_cast_fp16")];
             tensor<string, []> var_4450_pad_type_0 = const()[name = tensor<string, []>("op_4450_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4450_strides_0 = const()[name = tensor<string, []>("op_4450_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4450_pad_0 = const()[name = tensor<string, []>("op_4450_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4450_dilations_0 = const()[name = tensor<string, []>("op_4450_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4450_groups_0 = const()[name = tensor<string, []>("op_4450_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(194010752))), name = tensor<string, []>("layers_11_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15465]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(193979712))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(266632640))), name = tensor<string, []>("layers_11_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15465]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(266601600))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4450_cast_fp16 = conv(dilations = var_4450_dilations_0, groups = var_4450_groups_0, pad = var_4450_pad_0, pad_type = var_4450_pad_type_0, strides = var_4450_strides_0, weight = layers_11_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_47_cast_fp16)[name = tensor<string, []>("op_4450_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_45_cast_fp16 = add(x = var_4444_cast_fp16, y = var_4450_cast_fp16)[name = tensor<string, []>("query_45_cast_fp16")];
             tensor<string, []> var_4459_pad_type_0 = const()[name = tensor<string, []>("op_4459_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3132,14 +3132,14 @@ program(1.0)
             tensor<int32, [4]> var_4459_pad_0 = const()[name = tensor<string, []>("op_4459_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4459_dilations_0 = const()[name = tensor<string, []>("op_4459_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4459_groups_0 = const()[name = tensor<string, []>("op_4459_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(194141888))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(194666240))), name = tensor<string, []>("layers_11_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(266763776))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(267550272))), name = tensor<string, []>("layers_11_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4459_cast_fp16 = conv(dilations = var_4459_dilations_0, groups = var_4459_groups_0, pad = var_4459_pad_0, pad_type = var_4459_pad_type_0, strides = var_4459_strides_0, weight = layers_11_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_47_cast_fp16)[name = tensor<string, []>("op_4459_cast_fp16")];
             tensor<string, []> var_4465_pad_type_0 = const()[name = tensor<string, []>("op_4465_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4465_strides_0 = const()[name = tensor<string, []>("op_4465_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4465_pad_0 = const()[name = tensor<string, []>("op_4465_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4465_dilations_0 = const()[name = tensor<string, []>("op_4465_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4465_groups_0 = const()[name = tensor<string, []>("op_4465_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(194700672))), name = tensor<string, []>("layers_11_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17119]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(194666368))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(267584768))), name = tensor<string, []>("layers_11_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17119]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(267550464))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4465_cast_fp16 = conv(dilations = var_4465_dilations_0, groups = var_4465_groups_0, pad = var_4465_pad_0, pad_type = var_4465_pad_type_0, strides = var_4465_strides_0, weight = layers_11_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_47_cast_fp16)[name = tensor<string, []>("op_4465_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_23_cast_fp16 = add(x = var_4459_cast_fp16, y = var_4465_cast_fp16)[name = tensor<string, []>("key_23_cast_fp16")];
             tensor<string, []> var_4475_pad_type_0 = const()[name = tensor<string, []>("op_4475_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3147,33 +3147,33 @@ program(1.0)
             tensor<int32, [4]> var_4475_pad_0 = const()[name = tensor<string, []>("op_4475_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4475_dilations_0 = const()[name = tensor<string, []>("op_4475_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4475_groups_0 = const()[name = tensor<string, []>("op_4475_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(194831808))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(195356160))), name = tensor<string, []>("layers_11_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(267715904))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268502400))), name = tensor<string, []>("layers_11_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4475_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4475_dilations_0, groups = var_4475_groups_0, pad = var_4475_pad_0, pad_type = var_4475_pad_type_0, strides = var_4475_strides_0, weight = layers_11_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_47_cast_fp16)[name = tensor<string, []>("op_4475_cast_fp16")];
             tensor<string, []> var_4481_pad_type_0 = const()[name = tensor<string, []>("op_4481_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4481_strides_0 = const()[name = tensor<string, []>("op_4481_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4481_pad_0 = const()[name = tensor<string, []>("op_4481_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4481_dilations_0 = const()[name = tensor<string, []>("op_4481_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4481_groups_0 = const()[name = tensor<string, []>("op_4481_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(195387712))), name = tensor<string, []>("layers_11_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15654]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(195356288))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268534016))), name = tensor<string, []>("layers_11_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15654]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268502592))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4481_cast_fp16 = conv(dilations = var_4481_dilations_0, groups = var_4481_groups_0, pad = var_4481_pad_0, pad_type = var_4481_pad_type_0, strides = var_4481_strides_0, weight = layers_11_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_47_cast_fp16)[name = tensor<string, []>("op_4481_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_23_cast_fp16 = add(x = var_4475_cast_fp16, y = var_4481_cast_fp16)[name = tensor<string, []>("value_23_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_4484_to_fp16 = const()[name = tensor<string, []>("op_4484_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(195518848)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_4484_to_fp16 = const()[name = tensor<string, []>("op_4484_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268665152)))];
             tensor<fp16, [1, 1024, 1, 188]> query_47_cast_fp16 = add(x = query_45_cast_fp16, y = var_4484_to_fp16)[name = tensor<string, []>("query_47_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_4487_to_fp16 = const()[name = tensor<string, []>("op_4487_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(195520960)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_4487_to_fp16 = const()[name = tensor<string, []>("op_4487_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268667264)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_23_cast_fp16 = add(x = query_45_cast_fp16, y = var_4487_to_fp16)[name = tensor<string, []>("q_with_bias_v_23_cast_fp16")];
             tensor<string, []> var_4497_pad_type_0 = const()[name = tensor<string, []>("op_4497_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4497_strides_0 = const()[name = tensor<string, []>("op_4497_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4497_pad_0 = const()[name = tensor<string, []>("op_4497_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4497_dilations_0 = const()[name = tensor<string, []>("op_4497_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4497_groups_0 = const()[name = tensor<string, []>("op_4497_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(195523072))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196047424))), name = tensor<string, []>("layers_11_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268669376))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(269455872))), name = tensor<string, []>("layers_11_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_4497_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4497_dilations_0, groups = var_4497_groups_0, pad = var_4497_pad_0, pad_type = var_4497_pad_type_0, strides = var_4497_strides_0, weight = layers_11_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_4497_cast_fp16")];
             tensor<string, []> var_4503_pad_type_0 = const()[name = tensor<string, []>("op_4503_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4503_strides_0 = const()[name = tensor<string, []>("op_4503_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4503_pad_0 = const()[name = tensor<string, []>("op_4503_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4503_dilations_0 = const()[name = tensor<string, []>("op_4503_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4503_groups_0 = const()[name = tensor<string, []>("op_4503_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196105536))), name = tensor<string, []>("layers_11_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [28939]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196047552))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(269514048))), name = tensor<string, []>("layers_11_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [28939]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(269456064))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_4503_cast_fp16 = conv(dilations = var_4503_dilations_0, groups = var_4503_groups_0, pad = var_4503_pad_0, pad_type = var_4503_pad_type_0, strides = var_4503_strides_0, weight = layers_11_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_4503_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_23_cast_fp16 = add(x = var_4497_cast_fp16, y = var_4503_cast_fp16)[name = tensor<string, []>("p_23_cast_fp16")];
             tensor<int32, [4]> var_4507 = const()[name = tensor<string, []>("op_4507"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -3224,22 +3224,22 @@ program(1.0)
             tensor<int32, [4]> var_4560_pad_0 = const()[name = tensor<string, []>("op_4560_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4560_dilations_0 = const()[name = tensor<string, []>("op_4560_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4560_groups_0 = const()[name = tensor<string, []>("op_4560_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196236672))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196761024))), name = tensor<string, []>("layers_11_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(269645184))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(270431680))), name = tensor<string, []>("layers_11_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4560_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4560_dilations_0, groups = var_4560_groups_0, pad = var_4560_pad_0, pad_type = var_4560_pad_type_0, strides = var_4560_strides_0, weight = layers_11_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_309_cast_fp16)[name = tensor<string, []>("op_4560_cast_fp16")];
             tensor<string, []> var_4566_pad_type_0 = const()[name = tensor<string, []>("op_4566_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4566_strides_0 = const()[name = tensor<string, []>("op_4566_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4566_pad_0 = const()[name = tensor<string, []>("op_4566_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4566_dilations_0 = const()[name = tensor<string, []>("op_4566_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4566_groups_0 = const()[name = tensor<string, []>("op_4566_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196793664))), name = tensor<string, []>("layers_11_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16205]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196761152))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(270464384))), name = tensor<string, []>("layers_11_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16205]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(270431872))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4566_cast_fp16 = conv(dilations = var_4566_dilations_0, groups = var_4566_groups_0, pad = var_4566_pad_0, pad_type = var_4566_pad_type_0, strides = var_4566_strides_0, weight = layers_11_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_309_cast_fp16)[name = tensor<string, []>("op_4566_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_49_cast_fp16 = add(x = var_4560_cast_fp16, y = var_4566_cast_fp16)[name = tensor<string, []>("obj_49_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_115_cast_fp16 = add(x = inputs_113_cast_fp16, y = obj_49_cast_fp16)[name = tensor<string, []>("inputs_115_cast_fp16")];
             tensor<int32, [1]> out_115_axes_0 = const()[name = tensor<string, []>("out_115_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4577_to_fp16 = const()[name = tensor<string, []>("op_4577_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_115_cast_fp16 = layer_norm(axes = out_115_axes_0, epsilon = var_4577_to_fp16, x = inputs_115_cast_fp16)[name = tensor<string, []>("out_115_cast_fp16")];
-            tensor<fp16, [1024]> input_311_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_311_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196924800)))];
-            tensor<fp16, [1024]> input_311_beta_0_to_fp16 = const()[name = tensor<string, []>("input_311_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196926912)))];
+            tensor<fp16, [1024]> input_311_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_311_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(270595520)))];
+            tensor<fp16, [1024]> input_311_beta_0_to_fp16 = const()[name = tensor<string, []>("input_311_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(270597632)))];
             tensor<fp16, []> input_311_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_311_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_311_cast_fp16 = batch_norm(beta = input_311_beta_0_to_fp16, epsilon = input_311_epsilon_0_to_fp16, gamma = input_311_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_115_cast_fp16)[name = tensor<string, []>("input_311_cast_fp16")];
             tensor<string, []> var_4598_pad_type_0 = const()[name = tensor<string, []>("op_4598_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3247,14 +3247,14 @@ program(1.0)
             tensor<int32, [4]> var_4598_pad_0 = const()[name = tensor<string, []>("op_4598_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4598_dilations_0 = const()[name = tensor<string, []>("op_4598_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4598_groups_0 = const()[name = tensor<string, []>("op_4598_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_11_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(196929024))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(197977664))), name = tensor<string, []>("layers_11_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_11_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(270599744))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(272172672))), name = tensor<string, []>("layers_11_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_4598_cast_fp16 = conv(dilations = var_4598_dilations_0, groups = var_4598_groups_0, pad = var_4598_pad_0, pad_type = var_4598_pad_type_0, strides = var_4598_strides_0, weight = layers_11_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_311_cast_fp16)[name = tensor<string, []>("op_4598_cast_fp16")];
             tensor<string, []> var_4604_pad_type_0 = const()[name = tensor<string, []>("op_4604_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4604_strides_0 = const()[name = tensor<string, []>("op_4604_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4604_pad_0 = const()[name = tensor<string, []>("op_4604_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4604_dilations_0 = const()[name = tensor<string, []>("op_4604_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4604_groups_0 = const()[name = tensor<string, []>("op_4604_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_11_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198038144))), name = tensor<string, []>("layers_11_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [30122]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(197977792))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_11_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(272233216))), name = tensor<string, []>("layers_11_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [30122]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(272172864))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_4604_cast_fp16 = conv(dilations = var_4604_dilations_0, groups = var_4604_groups_0, pad = var_4604_pad_0, pad_type = var_4604_pad_type_0, strides = var_4604_strides_0, weight = layers_11_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_311_cast_fp16)[name = tensor<string, []>("op_4604_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_313_cast_fp16 = add(x = var_4598_cast_fp16, y = var_4604_cast_fp16)[name = tensor<string, []>("input_313_cast_fp16")];
             tensor<int32, []> input_315_split_num_splits_0 = const()[name = tensor<string, []>("input_315_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -3267,8 +3267,8 @@ program(1.0)
             tensor<int32, []> input_317_groups_0 = const()[name = tensor<string, []>("input_317_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_317_strides_0 = const()[name = tensor<string, []>("input_317_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_317_dilations_0 = const()[name = tensor<string, []>("input_317_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_290_to_fp16 = const()[name = tensor<string, []>("const_290_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198300352)))];
-            tensor<fp16, [1024]> const_291_to_fp16 = const()[name = tensor<string, []>("const_291_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198318848)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_290_to_fp16 = const()[name = tensor<string, []>("const_290_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(272495424)))];
+            tensor<fp16, [1024]> const_291_to_fp16 = const()[name = tensor<string, []>("const_291_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(272513920)))];
             tensor<fp16, [1, 1024, 1, 188]> input_319_cast_fp16 = conv(bias = const_291_to_fp16, dilations = input_317_dilations_0, groups = input_317_groups_0, pad = input_317_pad_0, pad_type = input_317_pad_type_0, strides = input_317_strides_0, weight = const_290_to_fp16, x = input_315_cast_fp16)[name = tensor<string, []>("input_319_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_321_cast_fp16 = silu(x = input_319_cast_fp16)[name = tensor<string, []>("input_321_cast_fp16")];
             tensor<string, []> var_4626_pad_type_0 = const()[name = tensor<string, []>("op_4626_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3276,22 +3276,22 @@ program(1.0)
             tensor<int32, [4]> var_4626_pad_0 = const()[name = tensor<string, []>("op_4626_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4626_dilations_0 = const()[name = tensor<string, []>("op_4626_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4626_groups_0 = const()[name = tensor<string, []>("op_4626_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198320960))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198845312))), name = tensor<string, []>("layers_11_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(272516032))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(273302528))), name = tensor<string, []>("layers_11_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4626_cast_fp16 = conv(dilations = var_4626_dilations_0, groups = var_4626_groups_0, pad = var_4626_pad_0, pad_type = var_4626_pad_type_0, strides = var_4626_strides_0, weight = layers_11_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_321_cast_fp16)[name = tensor<string, []>("op_4626_cast_fp16")];
             tensor<string, []> var_4632_pad_type_0 = const()[name = tensor<string, []>("op_4632_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4632_strides_0 = const()[name = tensor<string, []>("op_4632_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4632_pad_0 = const()[name = tensor<string, []>("op_4632_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4632_dilations_0 = const()[name = tensor<string, []>("op_4632_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4632_groups_0 = const()[name = tensor<string, []>("op_4632_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_11_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198877312))), name = tensor<string, []>("layers_11_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15904]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(198845440))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_11_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(273334592))), name = tensor<string, []>("layers_11_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15904]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(273302720))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4632_cast_fp16 = conv(dilations = var_4632_dilations_0, groups = var_4632_groups_0, pad = var_4632_pad_0, pad_type = var_4632_pad_type_0, strides = var_4632_strides_0, weight = layers_11_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_321_cast_fp16)[name = tensor<string, []>("op_4632_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_71_cast_fp16 = add(x = var_4626_cast_fp16, y = var_4632_cast_fp16)[name = tensor<string, []>("x_71_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_117_cast_fp16 = add(x = inputs_115_cast_fp16, y = x_71_cast_fp16)[name = tensor<string, []>("inputs_117_cast_fp16")];
             tensor<int32, [1]> out_117_axes_0 = const()[name = tensor<string, []>("out_117_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4643_to_fp16 = const()[name = tensor<string, []>("op_4643_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_117_cast_fp16 = layer_norm(axes = out_117_axes_0, epsilon = var_4643_to_fp16, x = inputs_117_cast_fp16)[name = tensor<string, []>("out_117_cast_fp16")];
-            tensor<fp16, [1024]> input_323_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_323_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(199008448)))];
-            tensor<fp16, [1024]> input_323_beta_0_to_fp16 = const()[name = tensor<string, []>("input_323_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(199010560)))];
+            tensor<fp16, [1024]> input_323_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_323_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(273465728)))];
+            tensor<fp16, [1024]> input_323_beta_0_to_fp16 = const()[name = tensor<string, []>("input_323_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(273467840)))];
             tensor<fp16, []> input_323_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_323_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_323_cast_fp16 = batch_norm(beta = input_323_beta_0_to_fp16, epsilon = input_323_epsilon_0_to_fp16, gamma = input_323_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_117_cast_fp16)[name = tensor<string, []>("input_323_cast_fp16")];
             tensor<string, []> var_4663_pad_type_0 = const()[name = tensor<string, []>("op_4663_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3299,14 +3299,14 @@ program(1.0)
             tensor<int32, [4]> var_4663_pad_0 = const()[name = tensor<string, []>("op_4663_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4663_dilations_0 = const()[name = tensor<string, []>("op_4663_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4663_groups_0 = const()[name = tensor<string, []>("op_4663_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_11_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(199012672))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201109888))), name = tensor<string, []>("layers_11_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_11_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(273469952))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(276615744))), name = tensor<string, []>("layers_11_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4663_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_4663_dilations_0, groups = var_4663_groups_0, pad = var_4663_pad_0, pad_type = var_4663_pad_type_0, strides = var_4663_strides_0, weight = layers_11_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_323_cast_fp16)[name = tensor<string, []>("op_4663_cast_fp16")];
             tensor<string, []> var_4669_pad_type_0 = const()[name = tensor<string, []>("op_4669_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4669_strides_0 = const()[name = tensor<string, []>("op_4669_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4669_pad_0 = const()[name = tensor<string, []>("op_4669_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4669_dilations_0 = const()[name = tensor<string, []>("op_4669_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4669_groups_0 = const()[name = tensor<string, []>("op_4669_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_11_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201272768))), name = tensor<string, []>("layers_11_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81337]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201110016))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_11_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(276778688))), name = tensor<string, []>("layers_11_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81337]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(276615936))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4669_cast_fp16 = conv(dilations = var_4669_dilations_0, groups = var_4669_groups_0, pad = var_4669_pad_0, pad_type = var_4669_pad_type_0, strides = var_4669_strides_0, weight = layers_11_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_323_cast_fp16)[name = tensor<string, []>("op_4669_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_325_cast_fp16 = add(x = var_4663_cast_fp16, y = var_4669_cast_fp16)[name = tensor<string, []>("input_325_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_327_cast_fp16 = silu(x = input_325_cast_fp16)[name = tensor<string, []>("input_327_cast_fp16")];
@@ -3315,14 +3315,14 @@ program(1.0)
             tensor<int32, [4]> var_4680_pad_0 = const()[name = tensor<string, []>("op_4680_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4680_dilations_0 = const()[name = tensor<string, []>("op_4680_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4680_groups_0 = const()[name = tensor<string, []>("op_4680_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_11_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(201797120))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(203894336))), name = tensor<string, []>("layers_11_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_11_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(277303040))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(280448832))), name = tensor<string, []>("layers_11_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4680_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4680_dilations_0, groups = var_4680_groups_0, pad = var_4680_pad_0, pad_type = var_4680_pad_type_0, strides = var_4680_strides_0, weight = layers_11_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_327_cast_fp16)[name = tensor<string, []>("op_4680_cast_fp16")];
             tensor<string, []> var_4686_pad_type_0 = const()[name = tensor<string, []>("op_4686_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4686_strides_0 = const()[name = tensor<string, []>("op_4686_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4686_pad_0 = const()[name = tensor<string, []>("op_4686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4686_dilations_0 = const()[name = tensor<string, []>("op_4686_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4686_groups_0 = const()[name = tensor<string, []>("op_4686_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_11_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204061376))), name = tensor<string, []>("layers_11_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [83422]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(203894464))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_11_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(280615936))), name = tensor<string, []>("layers_11_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [83422]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(280449024))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4686_cast_fp16 = conv(dilations = var_4686_dilations_0, groups = var_4686_groups_0, pad = var_4686_pad_0, pad_type = var_4686_pad_type_0, strides = var_4686_strides_0, weight = layers_11_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_327_cast_fp16)[name = tensor<string, []>("op_4686_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_73_cast_fp16 = add(x = var_4680_cast_fp16, y = var_4686_cast_fp16)[name = tensor<string, []>("x_73_cast_fp16")];
             tensor<fp16, []> var_4688_to_fp16 = const()[name = tensor<string, []>("op_4688_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -3331,16 +3331,16 @@ program(1.0)
             tensor<int32, [1]> out_119_axes_0 = const()[name = tensor<string, []>("out_119_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4699_to_fp16 = const()[name = tensor<string, []>("op_4699_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_119_cast_fp16 = layer_norm(axes = out_119_axes_0, epsilon = var_4699_to_fp16, x = inputs_119_cast_fp16)[name = tensor<string, []>("out_119_cast_fp16")];
-            tensor<fp16, [1024]> inputs_121_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_121_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204585728)))];
-            tensor<fp16, [1024]> inputs_121_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_121_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204587840)))];
+            tensor<fp16, [1024]> inputs_121_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_121_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281140288)))];
+            tensor<fp16, [1024]> inputs_121_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_121_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281142400)))];
             tensor<fp16, []> inputs_121_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_121_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_121_cast_fp16 = batch_norm(beta = inputs_121_beta_0_to_fp16, epsilon = inputs_121_epsilon_0_to_fp16, gamma = inputs_121_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_119_cast_fp16)[name = tensor<string, []>("inputs_121_cast_fp16")];
             tensor<int32, []> var_4713 = const()[name = tensor<string, []>("op_4713"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_121_axes_0 = const()[name = tensor<string, []>("out_121_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4744_to_fp16 = const()[name = tensor<string, []>("op_4744_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_121_cast_fp16 = layer_norm(axes = out_121_axes_0, epsilon = var_4744_to_fp16, x = inputs_121_cast_fp16)[name = tensor<string, []>("out_121_cast_fp16")];
-            tensor<fp16, [1024]> input_329_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_329_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204589952)))];
-            tensor<fp16, [1024]> input_329_beta_0_to_fp16 = const()[name = tensor<string, []>("input_329_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204592064)))];
+            tensor<fp16, [1024]> input_329_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_329_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281144512)))];
+            tensor<fp16, [1024]> input_329_beta_0_to_fp16 = const()[name = tensor<string, []>("input_329_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281146624)))];
             tensor<fp16, []> input_329_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_329_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_329_cast_fp16 = batch_norm(beta = input_329_beta_0_to_fp16, epsilon = input_329_epsilon_0_to_fp16, gamma = input_329_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_121_cast_fp16)[name = tensor<string, []>("input_329_cast_fp16")];
             tensor<string, []> var_4764_pad_type_0 = const()[name = tensor<string, []>("op_4764_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3348,14 +3348,14 @@ program(1.0)
             tensor<int32, [4]> var_4764_pad_0 = const()[name = tensor<string, []>("op_4764_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4764_dilations_0 = const()[name = tensor<string, []>("op_4764_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4764_groups_0 = const()[name = tensor<string, []>("op_4764_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_12_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(204594176))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(206691392))), name = tensor<string, []>("layers_12_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_12_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281148736))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284294528))), name = tensor<string, []>("layers_12_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4764_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_4764_dilations_0, groups = var_4764_groups_0, pad = var_4764_pad_0, pad_type = var_4764_pad_type_0, strides = var_4764_strides_0, weight = layers_12_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_329_cast_fp16)[name = tensor<string, []>("op_4764_cast_fp16")];
             tensor<string, []> var_4770_pad_type_0 = const()[name = tensor<string, []>("op_4770_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4770_strides_0 = const()[name = tensor<string, []>("op_4770_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4770_pad_0 = const()[name = tensor<string, []>("op_4770_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4770_dilations_0 = const()[name = tensor<string, []>("op_4770_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4770_groups_0 = const()[name = tensor<string, []>("op_4770_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_12_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(206864576))), name = tensor<string, []>("layers_12_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [86491]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(206691520))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_12_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284467776))), name = tensor<string, []>("layers_12_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [86491]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284294720))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_4770_cast_fp16 = conv(dilations = var_4770_dilations_0, groups = var_4770_groups_0, pad = var_4770_pad_0, pad_type = var_4770_pad_type_0, strides = var_4770_strides_0, weight = layers_12_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_329_cast_fp16)[name = tensor<string, []>("op_4770_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_331_cast_fp16 = add(x = var_4764_cast_fp16, y = var_4770_cast_fp16)[name = tensor<string, []>("input_331_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_333_cast_fp16 = silu(x = input_331_cast_fp16)[name = tensor<string, []>("input_333_cast_fp16")];
@@ -3364,14 +3364,14 @@ program(1.0)
             tensor<int32, [4]> var_4781_pad_0 = const()[name = tensor<string, []>("op_4781_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4781_dilations_0 = const()[name = tensor<string, []>("op_4781_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4781_groups_0 = const()[name = tensor<string, []>("op_4781_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_12_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(207388928))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(209486144))), name = tensor<string, []>("layers_12_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_12_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284992128))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288137920))), name = tensor<string, []>("layers_12_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4781_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4781_dilations_0, groups = var_4781_groups_0, pad = var_4781_pad_0, pad_type = var_4781_pad_type_0, strides = var_4781_strides_0, weight = layers_12_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_333_cast_fp16)[name = tensor<string, []>("op_4781_cast_fp16")];
             tensor<string, []> var_4787_pad_type_0 = const()[name = tensor<string, []>("op_4787_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4787_strides_0 = const()[name = tensor<string, []>("op_4787_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4787_pad_0 = const()[name = tensor<string, []>("op_4787_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4787_dilations_0 = const()[name = tensor<string, []>("op_4787_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4787_groups_0 = const()[name = tensor<string, []>("op_4787_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_12_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(209646080))), name = tensor<string, []>("layers_12_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [79869]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(209486272))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_12_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288297920))), name = tensor<string, []>("layers_12_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [79869]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288138112))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4787_cast_fp16 = conv(dilations = var_4787_dilations_0, groups = var_4787_groups_0, pad = var_4787_pad_0, pad_type = var_4787_pad_type_0, strides = var_4787_strides_0, weight = layers_12_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_333_cast_fp16)[name = tensor<string, []>("op_4787_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_75_cast_fp16 = add(x = var_4781_cast_fp16, y = var_4787_cast_fp16)[name = tensor<string, []>("x_75_cast_fp16")];
             tensor<fp16, []> var_4789_to_fp16 = const()[name = tensor<string, []>("op_4789_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -3380,8 +3380,8 @@ program(1.0)
             tensor<int32, [1]> out_123_axes_0 = const()[name = tensor<string, []>("out_123_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4800_to_fp16 = const()[name = tensor<string, []>("op_4800_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_123_cast_fp16 = layer_norm(axes = out_123_axes_0, epsilon = var_4800_to_fp16, x = inputs_123_cast_fp16)[name = tensor<string, []>("out_123_cast_fp16")];
-            tensor<fp16, [1024]> obj_51_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_51_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(210170432)))];
-            tensor<fp16, [1024]> obj_51_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_51_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(210172544)))];
+            tensor<fp16, [1024]> obj_51_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_51_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288822272)))];
+            tensor<fp16, [1024]> obj_51_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_51_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288824384)))];
             tensor<fp16, []> obj_51_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_51_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_51_cast_fp16 = batch_norm(beta = obj_51_beta_0_to_fp16, epsilon = obj_51_epsilon_0_to_fp16, gamma = obj_51_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_123_cast_fp16)[name = tensor<string, []>("obj_51_cast_fp16")];
             tensor<string, []> var_4825_pad_type_0 = const()[name = tensor<string, []>("op_4825_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3389,14 +3389,14 @@ program(1.0)
             tensor<int32, [4]> var_4825_pad_0 = const()[name = tensor<string, []>("op_4825_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4825_dilations_0 = const()[name = tensor<string, []>("op_4825_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4825_groups_0 = const()[name = tensor<string, []>("op_4825_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(210174656))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(210699008))), name = tensor<string, []>("layers_12_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288826496))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(289612992))), name = tensor<string, []>("layers_12_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4825_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4825_dilations_0, groups = var_4825_groups_0, pad = var_4825_pad_0, pad_type = var_4825_pad_type_0, strides = var_4825_strides_0, weight = layers_12_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_51_cast_fp16)[name = tensor<string, []>("op_4825_cast_fp16")];
             tensor<string, []> var_4831_pad_type_0 = const()[name = tensor<string, []>("op_4831_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4831_strides_0 = const()[name = tensor<string, []>("op_4831_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4831_pad_0 = const()[name = tensor<string, []>("op_4831_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4831_dilations_0 = const()[name = tensor<string, []>("op_4831_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4831_groups_0 = const()[name = tensor<string, []>("op_4831_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(210730304))), name = tensor<string, []>("layers_12_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15537]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(210699136))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(289644352))), name = tensor<string, []>("layers_12_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15537]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(289613184))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4831_cast_fp16 = conv(dilations = var_4831_dilations_0, groups = var_4831_groups_0, pad = var_4831_pad_0, pad_type = var_4831_pad_type_0, strides = var_4831_strides_0, weight = layers_12_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_51_cast_fp16)[name = tensor<string, []>("op_4831_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_49_cast_fp16 = add(x = var_4825_cast_fp16, y = var_4831_cast_fp16)[name = tensor<string, []>("query_49_cast_fp16")];
             tensor<string, []> var_4840_pad_type_0 = const()[name = tensor<string, []>("op_4840_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3404,14 +3404,14 @@ program(1.0)
             tensor<int32, [4]> var_4840_pad_0 = const()[name = tensor<string, []>("op_4840_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4840_dilations_0 = const()[name = tensor<string, []>("op_4840_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4840_groups_0 = const()[name = tensor<string, []>("op_4840_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(210861440))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(211385792))), name = tensor<string, []>("layers_12_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(289775488))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(290561984))), name = tensor<string, []>("layers_12_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4840_cast_fp16 = conv(dilations = var_4840_dilations_0, groups = var_4840_groups_0, pad = var_4840_pad_0, pad_type = var_4840_pad_type_0, strides = var_4840_strides_0, weight = layers_12_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_51_cast_fp16)[name = tensor<string, []>("op_4840_cast_fp16")];
             tensor<string, []> var_4846_pad_type_0 = const()[name = tensor<string, []>("op_4846_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4846_strides_0 = const()[name = tensor<string, []>("op_4846_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4846_pad_0 = const()[name = tensor<string, []>("op_4846_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4846_dilations_0 = const()[name = tensor<string, []>("op_4846_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4846_groups_0 = const()[name = tensor<string, []>("op_4846_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(211418880))), name = tensor<string, []>("layers_12_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16444]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(211385920))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(290595136))), name = tensor<string, []>("layers_12_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16444]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(290562176))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4846_cast_fp16 = conv(dilations = var_4846_dilations_0, groups = var_4846_groups_0, pad = var_4846_pad_0, pad_type = var_4846_pad_type_0, strides = var_4846_strides_0, weight = layers_12_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_51_cast_fp16)[name = tensor<string, []>("op_4846_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_25_cast_fp16 = add(x = var_4840_cast_fp16, y = var_4846_cast_fp16)[name = tensor<string, []>("key_25_cast_fp16")];
             tensor<string, []> var_4856_pad_type_0 = const()[name = tensor<string, []>("op_4856_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3419,33 +3419,33 @@ program(1.0)
             tensor<int32, [4]> var_4856_pad_0 = const()[name = tensor<string, []>("op_4856_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4856_dilations_0 = const()[name = tensor<string, []>("op_4856_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4856_groups_0 = const()[name = tensor<string, []>("op_4856_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(211550016))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212074368))), name = tensor<string, []>("layers_12_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(290726272))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(291512768))), name = tensor<string, []>("layers_12_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4856_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4856_dilations_0, groups = var_4856_groups_0, pad = var_4856_pad_0, pad_type = var_4856_pad_type_0, strides = var_4856_strides_0, weight = layers_12_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_51_cast_fp16)[name = tensor<string, []>("op_4856_cast_fp16")];
             tensor<string, []> var_4862_pad_type_0 = const()[name = tensor<string, []>("op_4862_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4862_strides_0 = const()[name = tensor<string, []>("op_4862_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4862_pad_0 = const()[name = tensor<string, []>("op_4862_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4862_dilations_0 = const()[name = tensor<string, []>("op_4862_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4862_groups_0 = const()[name = tensor<string, []>("op_4862_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212105280))), name = tensor<string, []>("layers_12_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15338]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212074496))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(291543744))), name = tensor<string, []>("layers_12_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15338]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(291512960))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4862_cast_fp16 = conv(dilations = var_4862_dilations_0, groups = var_4862_groups_0, pad = var_4862_pad_0, pad_type = var_4862_pad_type_0, strides = var_4862_strides_0, weight = layers_12_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_51_cast_fp16)[name = tensor<string, []>("op_4862_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_25_cast_fp16 = add(x = var_4856_cast_fp16, y = var_4862_cast_fp16)[name = tensor<string, []>("value_25_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_4865_to_fp16 = const()[name = tensor<string, []>("op_4865_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212236416)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_4865_to_fp16 = const()[name = tensor<string, []>("op_4865_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(291674880)))];
             tensor<fp16, [1, 1024, 1, 188]> query_51_cast_fp16 = add(x = query_49_cast_fp16, y = var_4865_to_fp16)[name = tensor<string, []>("query_51_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_4868_to_fp16 = const()[name = tensor<string, []>("op_4868_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212238528)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_4868_to_fp16 = const()[name = tensor<string, []>("op_4868_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(291676992)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_25_cast_fp16 = add(x = query_49_cast_fp16, y = var_4868_to_fp16)[name = tensor<string, []>("q_with_bias_v_25_cast_fp16")];
             tensor<string, []> var_4878_pad_type_0 = const()[name = tensor<string, []>("op_4878_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4878_strides_0 = const()[name = tensor<string, []>("op_4878_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4878_pad_0 = const()[name = tensor<string, []>("op_4878_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4878_dilations_0 = const()[name = tensor<string, []>("op_4878_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4878_groups_0 = const()[name = tensor<string, []>("op_4878_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212240640))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212764992))), name = tensor<string, []>("layers_12_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(291679104))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(292465600))), name = tensor<string, []>("layers_12_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_4878_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4878_dilations_0, groups = var_4878_groups_0, pad = var_4878_pad_0, pad_type = var_4878_pad_type_0, strides = var_4878_strides_0, weight = layers_12_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_4878_cast_fp16")];
             tensor<string, []> var_4884_pad_type_0 = const()[name = tensor<string, []>("op_4884_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4884_strides_0 = const()[name = tensor<string, []>("op_4884_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4884_pad_0 = const()[name = tensor<string, []>("op_4884_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4884_dilations_0 = const()[name = tensor<string, []>("op_4884_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4884_groups_0 = const()[name = tensor<string, []>("op_4884_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212832448))), name = tensor<string, []>("layers_12_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33616]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212765120))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(292533120))), name = tensor<string, []>("layers_12_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33616]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(292465792))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_4884_cast_fp16 = conv(dilations = var_4884_dilations_0, groups = var_4884_groups_0, pad = var_4884_pad_0, pad_type = var_4884_pad_type_0, strides = var_4884_strides_0, weight = layers_12_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_4884_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_25_cast_fp16 = add(x = var_4878_cast_fp16, y = var_4884_cast_fp16)[name = tensor<string, []>("p_25_cast_fp16")];
             tensor<int32, [4]> var_4888 = const()[name = tensor<string, []>("op_4888"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -3496,22 +3496,22 @@ program(1.0)
             tensor<int32, [4]> var_4941_pad_0 = const()[name = tensor<string, []>("op_4941_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4941_dilations_0 = const()[name = tensor<string, []>("op_4941_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4941_groups_0 = const()[name = tensor<string, []>("op_4941_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(212963584))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(213487936))), name = tensor<string, []>("layers_12_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(292664256))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293450752))), name = tensor<string, []>("layers_12_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4941_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_4941_dilations_0, groups = var_4941_groups_0, pad = var_4941_pad_0, pad_type = var_4941_pad_type_0, strides = var_4941_strides_0, weight = layers_12_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_335_cast_fp16)[name = tensor<string, []>("op_4941_cast_fp16")];
             tensor<string, []> var_4947_pad_type_0 = const()[name = tensor<string, []>("op_4947_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4947_strides_0 = const()[name = tensor<string, []>("op_4947_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4947_pad_0 = const()[name = tensor<string, []>("op_4947_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4947_dilations_0 = const()[name = tensor<string, []>("op_4947_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4947_groups_0 = const()[name = tensor<string, []>("op_4947_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(213521152))), name = tensor<string, []>("layers_12_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16509]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(213488064))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293484032))), name = tensor<string, []>("layers_12_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16509]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293450944))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_4947_cast_fp16 = conv(dilations = var_4947_dilations_0, groups = var_4947_groups_0, pad = var_4947_pad_0, pad_type = var_4947_pad_type_0, strides = var_4947_strides_0, weight = layers_12_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_335_cast_fp16)[name = tensor<string, []>("op_4947_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_53_cast_fp16 = add(x = var_4941_cast_fp16, y = var_4947_cast_fp16)[name = tensor<string, []>("obj_53_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_125_cast_fp16 = add(x = inputs_123_cast_fp16, y = obj_53_cast_fp16)[name = tensor<string, []>("inputs_125_cast_fp16")];
             tensor<int32, [1]> out_125_axes_0 = const()[name = tensor<string, []>("out_125_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_4958_to_fp16 = const()[name = tensor<string, []>("op_4958_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_125_cast_fp16 = layer_norm(axes = out_125_axes_0, epsilon = var_4958_to_fp16, x = inputs_125_cast_fp16)[name = tensor<string, []>("out_125_cast_fp16")];
-            tensor<fp16, [1024]> input_337_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_337_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(213652288)))];
-            tensor<fp16, [1024]> input_337_beta_0_to_fp16 = const()[name = tensor<string, []>("input_337_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(213654400)))];
+            tensor<fp16, [1024]> input_337_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_337_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293615168)))];
+            tensor<fp16, [1024]> input_337_beta_0_to_fp16 = const()[name = tensor<string, []>("input_337_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293617280)))];
             tensor<fp16, []> input_337_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_337_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_337_cast_fp16 = batch_norm(beta = input_337_beta_0_to_fp16, epsilon = input_337_epsilon_0_to_fp16, gamma = input_337_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_125_cast_fp16)[name = tensor<string, []>("input_337_cast_fp16")];
             tensor<string, []> var_4979_pad_type_0 = const()[name = tensor<string, []>("op_4979_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3519,14 +3519,14 @@ program(1.0)
             tensor<int32, [4]> var_4979_pad_0 = const()[name = tensor<string, []>("op_4979_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4979_dilations_0 = const()[name = tensor<string, []>("op_4979_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4979_groups_0 = const()[name = tensor<string, []>("op_4979_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_12_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(213656512))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(214705152))), name = tensor<string, []>("layers_12_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_12_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293619392))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295192320))), name = tensor<string, []>("layers_12_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_4979_cast_fp16 = conv(dilations = var_4979_dilations_0, groups = var_4979_groups_0, pad = var_4979_pad_0, pad_type = var_4979_pad_type_0, strides = var_4979_strides_0, weight = layers_12_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_337_cast_fp16)[name = tensor<string, []>("op_4979_cast_fp16")];
             tensor<string, []> var_4985_pad_type_0 = const()[name = tensor<string, []>("op_4985_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_4985_strides_0 = const()[name = tensor<string, []>("op_4985_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_4985_pad_0 = const()[name = tensor<string, []>("op_4985_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_4985_dilations_0 = const()[name = tensor<string, []>("op_4985_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_4985_groups_0 = const()[name = tensor<string, []>("op_4985_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_12_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(214768768))), name = tensor<string, []>("layers_12_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31699]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(214705280))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_12_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295256000))), name = tensor<string, []>("layers_12_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31699]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295192512))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_4985_cast_fp16 = conv(dilations = var_4985_dilations_0, groups = var_4985_groups_0, pad = var_4985_pad_0, pad_type = var_4985_pad_type_0, strides = var_4985_strides_0, weight = layers_12_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_337_cast_fp16)[name = tensor<string, []>("op_4985_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_339_cast_fp16 = add(x = var_4979_cast_fp16, y = var_4985_cast_fp16)[name = tensor<string, []>("input_339_cast_fp16")];
             tensor<int32, []> input_341_split_num_splits_0 = const()[name = tensor<string, []>("input_341_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -3539,8 +3539,8 @@ program(1.0)
             tensor<int32, []> input_343_groups_0 = const()[name = tensor<string, []>("input_343_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_343_strides_0 = const()[name = tensor<string, []>("input_343_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_343_dilations_0 = const()[name = tensor<string, []>("input_343_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_292_to_fp16 = const()[name = tensor<string, []>("const_292_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215030976)))];
-            tensor<fp16, [1024]> const_293_to_fp16 = const()[name = tensor<string, []>("const_293_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215049472)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_292_to_fp16 = const()[name = tensor<string, []>("const_292_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295518208)))];
+            tensor<fp16, [1024]> const_293_to_fp16 = const()[name = tensor<string, []>("const_293_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295536704)))];
             tensor<fp16, [1, 1024, 1, 188]> input_345_cast_fp16 = conv(bias = const_293_to_fp16, dilations = input_343_dilations_0, groups = input_343_groups_0, pad = input_343_pad_0, pad_type = input_343_pad_type_0, strides = input_343_strides_0, weight = const_292_to_fp16, x = input_341_cast_fp16)[name = tensor<string, []>("input_345_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_347_cast_fp16 = silu(x = input_345_cast_fp16)[name = tensor<string, []>("input_347_cast_fp16")];
             tensor<string, []> var_5007_pad_type_0 = const()[name = tensor<string, []>("op_5007_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3548,22 +3548,22 @@ program(1.0)
             tensor<int32, [4]> var_5007_pad_0 = const()[name = tensor<string, []>("op_5007_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5007_dilations_0 = const()[name = tensor<string, []>("op_5007_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5007_groups_0 = const()[name = tensor<string, []>("op_5007_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215051584))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215575936))), name = tensor<string, []>("layers_12_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295538816))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296325312))), name = tensor<string, []>("layers_12_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5007_cast_fp16 = conv(dilations = var_5007_dilations_0, groups = var_5007_groups_0, pad = var_5007_pad_0, pad_type = var_5007_pad_type_0, strides = var_5007_strides_0, weight = layers_12_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_347_cast_fp16)[name = tensor<string, []>("op_5007_cast_fp16")];
             tensor<string, []> var_5013_pad_type_0 = const()[name = tensor<string, []>("op_5013_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5013_strides_0 = const()[name = tensor<string, []>("op_5013_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5013_pad_0 = const()[name = tensor<string, []>("op_5013_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5013_dilations_0 = const()[name = tensor<string, []>("op_5013_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5013_groups_0 = const()[name = tensor<string, []>("op_5013_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_12_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215608000))), name = tensor<string, []>("layers_12_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15909]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215576064))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_12_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296357440))), name = tensor<string, []>("layers_12_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15909]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296325504))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5013_cast_fp16 = conv(dilations = var_5013_dilations_0, groups = var_5013_groups_0, pad = var_5013_pad_0, pad_type = var_5013_pad_type_0, strides = var_5013_strides_0, weight = layers_12_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_347_cast_fp16)[name = tensor<string, []>("op_5013_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_77_cast_fp16 = add(x = var_5007_cast_fp16, y = var_5013_cast_fp16)[name = tensor<string, []>("x_77_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_127_cast_fp16 = add(x = inputs_125_cast_fp16, y = x_77_cast_fp16)[name = tensor<string, []>("inputs_127_cast_fp16")];
             tensor<int32, [1]> out_127_axes_0 = const()[name = tensor<string, []>("out_127_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5024_to_fp16 = const()[name = tensor<string, []>("op_5024_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_127_cast_fp16 = layer_norm(axes = out_127_axes_0, epsilon = var_5024_to_fp16, x = inputs_127_cast_fp16)[name = tensor<string, []>("out_127_cast_fp16")];
-            tensor<fp16, [1024]> input_349_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_349_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215739136)))];
-            tensor<fp16, [1024]> input_349_beta_0_to_fp16 = const()[name = tensor<string, []>("input_349_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215741248)))];
+            tensor<fp16, [1024]> input_349_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_349_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296488576)))];
+            tensor<fp16, [1024]> input_349_beta_0_to_fp16 = const()[name = tensor<string, []>("input_349_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296490688)))];
             tensor<fp16, []> input_349_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_349_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_349_cast_fp16 = batch_norm(beta = input_349_beta_0_to_fp16, epsilon = input_349_epsilon_0_to_fp16, gamma = input_349_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_127_cast_fp16)[name = tensor<string, []>("input_349_cast_fp16")];
             tensor<string, []> var_5044_pad_type_0 = const()[name = tensor<string, []>("op_5044_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3571,14 +3571,14 @@ program(1.0)
             tensor<int32, [4]> var_5044_pad_0 = const()[name = tensor<string, []>("op_5044_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5044_dilations_0 = const()[name = tensor<string, []>("op_5044_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5044_groups_0 = const()[name = tensor<string, []>("op_5044_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_12_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(215743360))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(217840576))), name = tensor<string, []>("layers_12_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_12_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296492800))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(299638592))), name = tensor<string, []>("layers_12_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5044_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_5044_dilations_0, groups = var_5044_groups_0, pad = var_5044_pad_0, pad_type = var_5044_pad_type_0, strides = var_5044_strides_0, weight = layers_12_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_349_cast_fp16)[name = tensor<string, []>("op_5044_cast_fp16")];
             tensor<string, []> var_5050_pad_type_0 = const()[name = tensor<string, []>("op_5050_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5050_strides_0 = const()[name = tensor<string, []>("op_5050_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5050_pad_0 = const()[name = tensor<string, []>("op_5050_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5050_dilations_0 = const()[name = tensor<string, []>("op_5050_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5050_groups_0 = const()[name = tensor<string, []>("op_5050_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_12_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(217999552))), name = tensor<string, []>("layers_12_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [79384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(217840704))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_12_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(299797632))), name = tensor<string, []>("layers_12_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [79384]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(299638784))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5050_cast_fp16 = conv(dilations = var_5050_dilations_0, groups = var_5050_groups_0, pad = var_5050_pad_0, pad_type = var_5050_pad_type_0, strides = var_5050_strides_0, weight = layers_12_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_349_cast_fp16)[name = tensor<string, []>("op_5050_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_351_cast_fp16 = add(x = var_5044_cast_fp16, y = var_5050_cast_fp16)[name = tensor<string, []>("input_351_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_353_cast_fp16 = silu(x = input_351_cast_fp16)[name = tensor<string, []>("input_353_cast_fp16")];
@@ -3587,14 +3587,14 @@ program(1.0)
             tensor<int32, [4]> var_5061_pad_0 = const()[name = tensor<string, []>("op_5061_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5061_dilations_0 = const()[name = tensor<string, []>("op_5061_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5061_groups_0 = const()[name = tensor<string, []>("op_5061_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_12_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(218523904))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220621120))), name = tensor<string, []>("layers_12_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_12_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(300321984))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303467776))), name = tensor<string, []>("layers_12_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5061_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5061_dilations_0, groups = var_5061_groups_0, pad = var_5061_pad_0, pad_type = var_5061_pad_type_0, strides = var_5061_strides_0, weight = layers_12_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_353_cast_fp16)[name = tensor<string, []>("op_5061_cast_fp16")];
             tensor<string, []> var_5067_pad_type_0 = const()[name = tensor<string, []>("op_5067_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5067_strides_0 = const()[name = tensor<string, []>("op_5067_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5067_pad_0 = const()[name = tensor<string, []>("op_5067_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5067_dilations_0 = const()[name = tensor<string, []>("op_5067_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5067_groups_0 = const()[name = tensor<string, []>("op_5067_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_12_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220788224))), name = tensor<string, []>("layers_12_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [83438]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(220621248))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_12_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303634944))), name = tensor<string, []>("layers_12_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [83438]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(303467968))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5067_cast_fp16 = conv(dilations = var_5067_dilations_0, groups = var_5067_groups_0, pad = var_5067_pad_0, pad_type = var_5067_pad_type_0, strides = var_5067_strides_0, weight = layers_12_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_353_cast_fp16)[name = tensor<string, []>("op_5067_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_79_cast_fp16 = add(x = var_5061_cast_fp16, y = var_5067_cast_fp16)[name = tensor<string, []>("x_79_cast_fp16")];
             tensor<fp16, []> var_5069_to_fp16 = const()[name = tensor<string, []>("op_5069_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -3603,16 +3603,16 @@ program(1.0)
             tensor<int32, [1]> out_129_axes_0 = const()[name = tensor<string, []>("out_129_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5080_to_fp16 = const()[name = tensor<string, []>("op_5080_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_129_cast_fp16 = layer_norm(axes = out_129_axes_0, epsilon = var_5080_to_fp16, x = inputs_129_cast_fp16)[name = tensor<string, []>("out_129_cast_fp16")];
-            tensor<fp16, [1024]> inputs_131_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_131_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(221312576)))];
-            tensor<fp16, [1024]> inputs_131_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_131_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(221314688)))];
+            tensor<fp16, [1024]> inputs_131_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_131_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304159296)))];
+            tensor<fp16, [1024]> inputs_131_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_131_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304161408)))];
             tensor<fp16, []> inputs_131_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_131_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_131_cast_fp16 = batch_norm(beta = inputs_131_beta_0_to_fp16, epsilon = inputs_131_epsilon_0_to_fp16, gamma = inputs_131_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_129_cast_fp16)[name = tensor<string, []>("inputs_131_cast_fp16")];
             tensor<int32, []> var_5094 = const()[name = tensor<string, []>("op_5094"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_131_axes_0 = const()[name = tensor<string, []>("out_131_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5125_to_fp16 = const()[name = tensor<string, []>("op_5125_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_131_cast_fp16 = layer_norm(axes = out_131_axes_0, epsilon = var_5125_to_fp16, x = inputs_131_cast_fp16)[name = tensor<string, []>("out_131_cast_fp16")];
-            tensor<fp16, [1024]> input_355_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_355_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(221316800)))];
-            tensor<fp16, [1024]> input_355_beta_0_to_fp16 = const()[name = tensor<string, []>("input_355_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(221318912)))];
+            tensor<fp16, [1024]> input_355_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_355_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304163520)))];
+            tensor<fp16, [1024]> input_355_beta_0_to_fp16 = const()[name = tensor<string, []>("input_355_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304165632)))];
             tensor<fp16, []> input_355_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_355_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_355_cast_fp16 = batch_norm(beta = input_355_beta_0_to_fp16, epsilon = input_355_epsilon_0_to_fp16, gamma = input_355_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_131_cast_fp16)[name = tensor<string, []>("input_355_cast_fp16")];
             tensor<string, []> var_5145_pad_type_0 = const()[name = tensor<string, []>("op_5145_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3620,14 +3620,14 @@ program(1.0)
             tensor<int32, [4]> var_5145_pad_0 = const()[name = tensor<string, []>("op_5145_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5145_dilations_0 = const()[name = tensor<string, []>("op_5145_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5145_groups_0 = const()[name = tensor<string, []>("op_5145_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_13_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(221321024))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(223418240))), name = tensor<string, []>("layers_13_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_13_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304167744))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(307313536))), name = tensor<string, []>("layers_13_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5145_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_5145_dilations_0, groups = var_5145_groups_0, pad = var_5145_pad_0, pad_type = var_5145_pad_type_0, strides = var_5145_strides_0, weight = layers_13_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_355_cast_fp16)[name = tensor<string, []>("op_5145_cast_fp16")];
             tensor<string, []> var_5151_pad_type_0 = const()[name = tensor<string, []>("op_5151_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5151_strides_0 = const()[name = tensor<string, []>("op_5151_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5151_pad_0 = const()[name = tensor<string, []>("op_5151_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5151_dilations_0 = const()[name = tensor<string, []>("op_5151_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5151_groups_0 = const()[name = tensor<string, []>("op_5151_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_13_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(223597312))), name = tensor<string, []>("layers_13_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89435]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(223418368))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_13_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(307492672))), name = tensor<string, []>("layers_13_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89435]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(307313728))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5151_cast_fp16 = conv(dilations = var_5151_dilations_0, groups = var_5151_groups_0, pad = var_5151_pad_0, pad_type = var_5151_pad_type_0, strides = var_5151_strides_0, weight = layers_13_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_355_cast_fp16)[name = tensor<string, []>("op_5151_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_357_cast_fp16 = add(x = var_5145_cast_fp16, y = var_5151_cast_fp16)[name = tensor<string, []>("input_357_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_359_cast_fp16 = silu(x = input_357_cast_fp16)[name = tensor<string, []>("input_359_cast_fp16")];
@@ -3636,14 +3636,14 @@ program(1.0)
             tensor<int32, [4]> var_5162_pad_0 = const()[name = tensor<string, []>("op_5162_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5162_dilations_0 = const()[name = tensor<string, []>("op_5162_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5162_groups_0 = const()[name = tensor<string, []>("op_5162_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_13_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(224121664))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226218880))), name = tensor<string, []>("layers_13_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_13_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(308017024))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311162816))), name = tensor<string, []>("layers_13_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5162_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5162_dilations_0, groups = var_5162_groups_0, pad = var_5162_pad_0, pad_type = var_5162_pad_type_0, strides = var_5162_strides_0, weight = layers_13_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_359_cast_fp16)[name = tensor<string, []>("op_5162_cast_fp16")];
             tensor<string, []> var_5168_pad_type_0 = const()[name = tensor<string, []>("op_5168_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5168_strides_0 = const()[name = tensor<string, []>("op_5168_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5168_pad_0 = const()[name = tensor<string, []>("op_5168_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5168_dilations_0 = const()[name = tensor<string, []>("op_5168_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5168_groups_0 = const()[name = tensor<string, []>("op_5168_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_13_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226390848))), name = tensor<string, []>("layers_13_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [85875]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226219008))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_13_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311334848))), name = tensor<string, []>("layers_13_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [85875]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311163008))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5168_cast_fp16 = conv(dilations = var_5168_dilations_0, groups = var_5168_groups_0, pad = var_5168_pad_0, pad_type = var_5168_pad_type_0, strides = var_5168_strides_0, weight = layers_13_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_359_cast_fp16)[name = tensor<string, []>("op_5168_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_81_cast_fp16 = add(x = var_5162_cast_fp16, y = var_5168_cast_fp16)[name = tensor<string, []>("x_81_cast_fp16")];
             tensor<fp16, []> var_5170_to_fp16 = const()[name = tensor<string, []>("op_5170_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -3652,8 +3652,8 @@ program(1.0)
             tensor<int32, [1]> out_133_axes_0 = const()[name = tensor<string, []>("out_133_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5181_to_fp16 = const()[name = tensor<string, []>("op_5181_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_133_cast_fp16 = layer_norm(axes = out_133_axes_0, epsilon = var_5181_to_fp16, x = inputs_133_cast_fp16)[name = tensor<string, []>("out_133_cast_fp16")];
-            tensor<fp16, [1024]> obj_55_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_55_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226915200)))];
-            tensor<fp16, [1024]> obj_55_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_55_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226917312)))];
+            tensor<fp16, [1024]> obj_55_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_55_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311859200)))];
+            tensor<fp16, [1024]> obj_55_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_55_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311861312)))];
             tensor<fp16, []> obj_55_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_55_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_55_cast_fp16 = batch_norm(beta = obj_55_beta_0_to_fp16, epsilon = obj_55_epsilon_0_to_fp16, gamma = obj_55_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_133_cast_fp16)[name = tensor<string, []>("obj_55_cast_fp16")];
             tensor<string, []> var_5206_pad_type_0 = const()[name = tensor<string, []>("op_5206_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3661,14 +3661,14 @@ program(1.0)
             tensor<int32, [4]> var_5206_pad_0 = const()[name = tensor<string, []>("op_5206_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5206_dilations_0 = const()[name = tensor<string, []>("op_5206_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5206_groups_0 = const()[name = tensor<string, []>("op_5206_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(226919424))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227443776))), name = tensor<string, []>("layers_13_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311863424))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312649920))), name = tensor<string, []>("layers_13_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5206_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5206_dilations_0, groups = var_5206_groups_0, pad = var_5206_pad_0, pad_type = var_5206_pad_type_0, strides = var_5206_strides_0, weight = layers_13_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_55_cast_fp16)[name = tensor<string, []>("op_5206_cast_fp16")];
             tensor<string, []> var_5212_pad_type_0 = const()[name = tensor<string, []>("op_5212_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5212_strides_0 = const()[name = tensor<string, []>("op_5212_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5212_pad_0 = const()[name = tensor<string, []>("op_5212_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5212_dilations_0 = const()[name = tensor<string, []>("op_5212_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5212_groups_0 = const()[name = tensor<string, []>("op_5212_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227479296))), name = tensor<string, []>("layers_13_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17647]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227443904))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312685504))), name = tensor<string, []>("layers_13_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17647]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312650112))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5212_cast_fp16 = conv(dilations = var_5212_dilations_0, groups = var_5212_groups_0, pad = var_5212_pad_0, pad_type = var_5212_pad_type_0, strides = var_5212_strides_0, weight = layers_13_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_55_cast_fp16)[name = tensor<string, []>("op_5212_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_53_cast_fp16 = add(x = var_5206_cast_fp16, y = var_5212_cast_fp16)[name = tensor<string, []>("query_53_cast_fp16")];
             tensor<string, []> var_5221_pad_type_0 = const()[name = tensor<string, []>("op_5221_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3676,14 +3676,14 @@ program(1.0)
             tensor<int32, [4]> var_5221_pad_0 = const()[name = tensor<string, []>("op_5221_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5221_dilations_0 = const()[name = tensor<string, []>("op_5221_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5221_groups_0 = const()[name = tensor<string, []>("op_5221_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(227610432))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228134784))), name = tensor<string, []>("layers_13_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312816640))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313603136))), name = tensor<string, []>("layers_13_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5221_cast_fp16 = conv(dilations = var_5221_dilations_0, groups = var_5221_groups_0, pad = var_5221_pad_0, pad_type = var_5221_pad_type_0, strides = var_5221_strides_0, weight = layers_13_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_55_cast_fp16)[name = tensor<string, []>("op_5221_cast_fp16")];
             tensor<string, []> var_5227_pad_type_0 = const()[name = tensor<string, []>("op_5227_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5227_strides_0 = const()[name = tensor<string, []>("op_5227_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5227_pad_0 = const()[name = tensor<string, []>("op_5227_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5227_dilations_0 = const()[name = tensor<string, []>("op_5227_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5227_groups_0 = const()[name = tensor<string, []>("op_5227_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228174336))), name = tensor<string, []>("layers_13_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19670]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228134912))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313642752))), name = tensor<string, []>("layers_13_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19670]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313603328))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5227_cast_fp16 = conv(dilations = var_5227_dilations_0, groups = var_5227_groups_0, pad = var_5227_pad_0, pad_type = var_5227_pad_type_0, strides = var_5227_strides_0, weight = layers_13_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_55_cast_fp16)[name = tensor<string, []>("op_5227_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_27_cast_fp16 = add(x = var_5221_cast_fp16, y = var_5227_cast_fp16)[name = tensor<string, []>("key_27_cast_fp16")];
             tensor<string, []> var_5237_pad_type_0 = const()[name = tensor<string, []>("op_5237_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3691,33 +3691,33 @@ program(1.0)
             tensor<int32, [4]> var_5237_pad_0 = const()[name = tensor<string, []>("op_5237_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5237_dilations_0 = const()[name = tensor<string, []>("op_5237_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5237_groups_0 = const()[name = tensor<string, []>("op_5237_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228305472))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228829824))), name = tensor<string, []>("layers_13_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313773888))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(314560384))), name = tensor<string, []>("layers_13_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5237_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5237_dilations_0, groups = var_5237_groups_0, pad = var_5237_pad_0, pad_type = var_5237_pad_type_0, strides = var_5237_strides_0, weight = layers_13_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_55_cast_fp16)[name = tensor<string, []>("op_5237_cast_fp16")];
             tensor<string, []> var_5243_pad_type_0 = const()[name = tensor<string, []>("op_5243_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5243_strides_0 = const()[name = tensor<string, []>("op_5243_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5243_pad_0 = const()[name = tensor<string, []>("op_5243_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5243_dilations_0 = const()[name = tensor<string, []>("op_5243_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5243_groups_0 = const()[name = tensor<string, []>("op_5243_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228861120))), name = tensor<string, []>("layers_13_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15549]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228829952))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(314591744))), name = tensor<string, []>("layers_13_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15549]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(314560576))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5243_cast_fp16 = conv(dilations = var_5243_dilations_0, groups = var_5243_groups_0, pad = var_5243_pad_0, pad_type = var_5243_pad_type_0, strides = var_5243_strides_0, weight = layers_13_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_55_cast_fp16)[name = tensor<string, []>("op_5243_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_27_cast_fp16 = add(x = var_5237_cast_fp16, y = var_5243_cast_fp16)[name = tensor<string, []>("value_27_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_5246_to_fp16 = const()[name = tensor<string, []>("op_5246_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228992256)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_5246_to_fp16 = const()[name = tensor<string, []>("op_5246_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(314722880)))];
             tensor<fp16, [1, 1024, 1, 188]> query_55_cast_fp16 = add(x = query_53_cast_fp16, y = var_5246_to_fp16)[name = tensor<string, []>("query_55_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_5249_to_fp16 = const()[name = tensor<string, []>("op_5249_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228994368)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_5249_to_fp16 = const()[name = tensor<string, []>("op_5249_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(314724992)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_27_cast_fp16 = add(x = query_53_cast_fp16, y = var_5249_to_fp16)[name = tensor<string, []>("q_with_bias_v_27_cast_fp16")];
             tensor<string, []> var_5259_pad_type_0 = const()[name = tensor<string, []>("op_5259_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5259_strides_0 = const()[name = tensor<string, []>("op_5259_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5259_pad_0 = const()[name = tensor<string, []>("op_5259_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5259_dilations_0 = const()[name = tensor<string, []>("op_5259_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5259_groups_0 = const()[name = tensor<string, []>("op_5259_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(228996480))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(229520832))), name = tensor<string, []>("layers_13_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(314727104))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315513600))), name = tensor<string, []>("layers_13_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_5259_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5259_dilations_0, groups = var_5259_groups_0, pad = var_5259_pad_0, pad_type = var_5259_pad_type_0, strides = var_5259_strides_0, weight = layers_13_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_5259_cast_fp16")];
             tensor<string, []> var_5265_pad_type_0 = const()[name = tensor<string, []>("op_5265_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5265_strides_0 = const()[name = tensor<string, []>("op_5265_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5265_pad_0 = const()[name = tensor<string, []>("op_5265_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5265_dilations_0 = const()[name = tensor<string, []>("op_5265_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5265_groups_0 = const()[name = tensor<string, []>("op_5265_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(229588608))), name = tensor<string, []>("layers_13_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33792]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(229520960))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315581440))), name = tensor<string, []>("layers_13_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33792]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315513792))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_5265_cast_fp16 = conv(dilations = var_5265_dilations_0, groups = var_5265_groups_0, pad = var_5265_pad_0, pad_type = var_5265_pad_type_0, strides = var_5265_strides_0, weight = layers_13_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_5265_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_27_cast_fp16 = add(x = var_5259_cast_fp16, y = var_5265_cast_fp16)[name = tensor<string, []>("p_27_cast_fp16")];
             tensor<int32, [4]> var_5269 = const()[name = tensor<string, []>("op_5269"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -3768,22 +3768,22 @@ program(1.0)
             tensor<int32, [4]> var_5322_pad_0 = const()[name = tensor<string, []>("op_5322_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5322_dilations_0 = const()[name = tensor<string, []>("op_5322_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5322_groups_0 = const()[name = tensor<string, []>("op_5322_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(229719744))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(230244096))), name = tensor<string, []>("layers_13_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315712576))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(316499072))), name = tensor<string, []>("layers_13_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5322_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5322_dilations_0, groups = var_5322_groups_0, pad = var_5322_pad_0, pad_type = var_5322_pad_type_0, strides = var_5322_strides_0, weight = layers_13_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_361_cast_fp16)[name = tensor<string, []>("op_5322_cast_fp16")];
             tensor<string, []> var_5328_pad_type_0 = const()[name = tensor<string, []>("op_5328_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5328_strides_0 = const()[name = tensor<string, []>("op_5328_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5328_pad_0 = const()[name = tensor<string, []>("op_5328_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5328_dilations_0 = const()[name = tensor<string, []>("op_5328_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5328_groups_0 = const()[name = tensor<string, []>("op_5328_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(230275968))), name = tensor<string, []>("layers_13_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15839]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(230244224))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(316531008))), name = tensor<string, []>("layers_13_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15839]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(316499264))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5328_cast_fp16 = conv(dilations = var_5328_dilations_0, groups = var_5328_groups_0, pad = var_5328_pad_0, pad_type = var_5328_pad_type_0, strides = var_5328_strides_0, weight = layers_13_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_361_cast_fp16)[name = tensor<string, []>("op_5328_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_57_cast_fp16 = add(x = var_5322_cast_fp16, y = var_5328_cast_fp16)[name = tensor<string, []>("obj_57_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_135_cast_fp16 = add(x = inputs_133_cast_fp16, y = obj_57_cast_fp16)[name = tensor<string, []>("inputs_135_cast_fp16")];
             tensor<int32, [1]> out_135_axes_0 = const()[name = tensor<string, []>("out_135_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5339_to_fp16 = const()[name = tensor<string, []>("op_5339_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_135_cast_fp16 = layer_norm(axes = out_135_axes_0, epsilon = var_5339_to_fp16, x = inputs_135_cast_fp16)[name = tensor<string, []>("out_135_cast_fp16")];
-            tensor<fp16, [1024]> input_363_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_363_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(230407104)))];
-            tensor<fp16, [1024]> input_363_beta_0_to_fp16 = const()[name = tensor<string, []>("input_363_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(230409216)))];
+            tensor<fp16, [1024]> input_363_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_363_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(316662144)))];
+            tensor<fp16, [1024]> input_363_beta_0_to_fp16 = const()[name = tensor<string, []>("input_363_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(316664256)))];
             tensor<fp16, []> input_363_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_363_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_363_cast_fp16 = batch_norm(beta = input_363_beta_0_to_fp16, epsilon = input_363_epsilon_0_to_fp16, gamma = input_363_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_135_cast_fp16)[name = tensor<string, []>("input_363_cast_fp16")];
             tensor<string, []> var_5360_pad_type_0 = const()[name = tensor<string, []>("op_5360_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3791,14 +3791,14 @@ program(1.0)
             tensor<int32, [4]> var_5360_pad_0 = const()[name = tensor<string, []>("op_5360_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5360_dilations_0 = const()[name = tensor<string, []>("op_5360_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5360_groups_0 = const()[name = tensor<string, []>("op_5360_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_13_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(230411328))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(231459968))), name = tensor<string, []>("layers_13_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_13_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(316666368))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318239296))), name = tensor<string, []>("layers_13_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_5360_cast_fp16 = conv(dilations = var_5360_dilations_0, groups = var_5360_groups_0, pad = var_5360_pad_0, pad_type = var_5360_pad_type_0, strides = var_5360_strides_0, weight = layers_13_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_363_cast_fp16)[name = tensor<string, []>("op_5360_cast_fp16")];
             tensor<string, []> var_5366_pad_type_0 = const()[name = tensor<string, []>("op_5366_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5366_strides_0 = const()[name = tensor<string, []>("op_5366_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5366_pad_0 = const()[name = tensor<string, []>("op_5366_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5366_dilations_0 = const()[name = tensor<string, []>("op_5366_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5366_groups_0 = const()[name = tensor<string, []>("op_5366_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_13_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(231522496))), name = tensor<string, []>("layers_13_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31158]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(231460096))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_13_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318301888))), name = tensor<string, []>("layers_13_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31158]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318239488))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_5366_cast_fp16 = conv(dilations = var_5366_dilations_0, groups = var_5366_groups_0, pad = var_5366_pad_0, pad_type = var_5366_pad_type_0, strides = var_5366_strides_0, weight = layers_13_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_363_cast_fp16)[name = tensor<string, []>("op_5366_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_365_cast_fp16 = add(x = var_5360_cast_fp16, y = var_5366_cast_fp16)[name = tensor<string, []>("input_365_cast_fp16")];
             tensor<int32, []> input_367_split_num_splits_0 = const()[name = tensor<string, []>("input_367_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -3811,8 +3811,8 @@ program(1.0)
             tensor<int32, []> input_369_groups_0 = const()[name = tensor<string, []>("input_369_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_369_strides_0 = const()[name = tensor<string, []>("input_369_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_369_dilations_0 = const()[name = tensor<string, []>("input_369_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_294_to_fp16 = const()[name = tensor<string, []>("const_294_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(231784704)))];
-            tensor<fp16, [1024]> const_295_to_fp16 = const()[name = tensor<string, []>("const_295_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(231803200)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_294_to_fp16 = const()[name = tensor<string, []>("const_294_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318564096)))];
+            tensor<fp16, [1024]> const_295_to_fp16 = const()[name = tensor<string, []>("const_295_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318582592)))];
             tensor<fp16, [1, 1024, 1, 188]> input_371_cast_fp16 = conv(bias = const_295_to_fp16, dilations = input_369_dilations_0, groups = input_369_groups_0, pad = input_369_pad_0, pad_type = input_369_pad_type_0, strides = input_369_strides_0, weight = const_294_to_fp16, x = input_367_cast_fp16)[name = tensor<string, []>("input_371_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_373_cast_fp16 = silu(x = input_371_cast_fp16)[name = tensor<string, []>("input_373_cast_fp16")];
             tensor<string, []> var_5388_pad_type_0 = const()[name = tensor<string, []>("op_5388_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3820,22 +3820,22 @@ program(1.0)
             tensor<int32, [4]> var_5388_pad_0 = const()[name = tensor<string, []>("op_5388_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5388_dilations_0 = const()[name = tensor<string, []>("op_5388_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5388_groups_0 = const()[name = tensor<string, []>("op_5388_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(231805312))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232329664))), name = tensor<string, []>("layers_13_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318584704))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(319371200))), name = tensor<string, []>("layers_13_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5388_cast_fp16 = conv(dilations = var_5388_dilations_0, groups = var_5388_groups_0, pad = var_5388_pad_0, pad_type = var_5388_pad_type_0, strides = var_5388_strides_0, weight = layers_13_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_373_cast_fp16)[name = tensor<string, []>("op_5388_cast_fp16")];
             tensor<string, []> var_5394_pad_type_0 = const()[name = tensor<string, []>("op_5394_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5394_strides_0 = const()[name = tensor<string, []>("op_5394_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5394_pad_0 = const()[name = tensor<string, []>("op_5394_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5394_dilations_0 = const()[name = tensor<string, []>("op_5394_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5394_groups_0 = const()[name = tensor<string, []>("op_5394_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_13_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232361472))), name = tensor<string, []>("layers_13_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15778]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232329792))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_13_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(319403072))), name = tensor<string, []>("layers_13_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15778]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(319371392))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5394_cast_fp16 = conv(dilations = var_5394_dilations_0, groups = var_5394_groups_0, pad = var_5394_pad_0, pad_type = var_5394_pad_type_0, strides = var_5394_strides_0, weight = layers_13_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_373_cast_fp16)[name = tensor<string, []>("op_5394_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_83_cast_fp16 = add(x = var_5388_cast_fp16, y = var_5394_cast_fp16)[name = tensor<string, []>("x_83_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_137_cast_fp16 = add(x = inputs_135_cast_fp16, y = x_83_cast_fp16)[name = tensor<string, []>("inputs_137_cast_fp16")];
             tensor<int32, [1]> out_137_axes_0 = const()[name = tensor<string, []>("out_137_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5405_to_fp16 = const()[name = tensor<string, []>("op_5405_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_137_cast_fp16 = layer_norm(axes = out_137_axes_0, epsilon = var_5405_to_fp16, x = inputs_137_cast_fp16)[name = tensor<string, []>("out_137_cast_fp16")];
-            tensor<fp16, [1024]> input_375_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_375_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232492608)))];
-            tensor<fp16, [1024]> input_375_beta_0_to_fp16 = const()[name = tensor<string, []>("input_375_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232494720)))];
+            tensor<fp16, [1024]> input_375_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_375_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(319534208)))];
+            tensor<fp16, [1024]> input_375_beta_0_to_fp16 = const()[name = tensor<string, []>("input_375_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(319536320)))];
             tensor<fp16, []> input_375_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_375_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_375_cast_fp16 = batch_norm(beta = input_375_beta_0_to_fp16, epsilon = input_375_epsilon_0_to_fp16, gamma = input_375_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_137_cast_fp16)[name = tensor<string, []>("input_375_cast_fp16")];
             tensor<string, []> var_5425_pad_type_0 = const()[name = tensor<string, []>("op_5425_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3843,14 +3843,14 @@ program(1.0)
             tensor<int32, [4]> var_5425_pad_0 = const()[name = tensor<string, []>("op_5425_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5425_dilations_0 = const()[name = tensor<string, []>("op_5425_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5425_groups_0 = const()[name = tensor<string, []>("op_5425_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_13_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(232496832))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(234594048))), name = tensor<string, []>("layers_13_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_13_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(319538432))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(322684224))), name = tensor<string, []>("layers_13_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5425_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_5425_dilations_0, groups = var_5425_groups_0, pad = var_5425_pad_0, pad_type = var_5425_pad_type_0, strides = var_5425_strides_0, weight = layers_13_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_375_cast_fp16)[name = tensor<string, []>("op_5425_cast_fp16")];
             tensor<string, []> var_5431_pad_type_0 = const()[name = tensor<string, []>("op_5431_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5431_strides_0 = const()[name = tensor<string, []>("op_5431_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5431_pad_0 = const()[name = tensor<string, []>("op_5431_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5431_dilations_0 = const()[name = tensor<string, []>("op_5431_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5431_groups_0 = const()[name = tensor<string, []>("op_5431_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_13_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(234748992))), name = tensor<string, []>("layers_13_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [77365]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(234594176))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_13_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(322839232))), name = tensor<string, []>("layers_13_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [77365]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(322684416))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5431_cast_fp16 = conv(dilations = var_5431_dilations_0, groups = var_5431_groups_0, pad = var_5431_pad_0, pad_type = var_5431_pad_type_0, strides = var_5431_strides_0, weight = layers_13_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_375_cast_fp16)[name = tensor<string, []>("op_5431_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_377_cast_fp16 = add(x = var_5425_cast_fp16, y = var_5431_cast_fp16)[name = tensor<string, []>("input_377_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_379_cast_fp16 = silu(x = input_377_cast_fp16)[name = tensor<string, []>("input_379_cast_fp16")];
@@ -3859,14 +3859,14 @@ program(1.0)
             tensor<int32, [4]> var_5442_pad_0 = const()[name = tensor<string, []>("op_5442_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5442_dilations_0 = const()[name = tensor<string, []>("op_5442_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5442_groups_0 = const()[name = tensor<string, []>("op_5442_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_13_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(235273344))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(237370560))), name = tensor<string, []>("layers_13_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_13_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(323363584))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(326509376))), name = tensor<string, []>("layers_13_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5442_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5442_dilations_0, groups = var_5442_groups_0, pad = var_5442_pad_0, pad_type = var_5442_pad_type_0, strides = var_5442_strides_0, weight = layers_13_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_379_cast_fp16)[name = tensor<string, []>("op_5442_cast_fp16")];
             tensor<string, []> var_5448_pad_type_0 = const()[name = tensor<string, []>("op_5448_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5448_strides_0 = const()[name = tensor<string, []>("op_5448_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5448_pad_0 = const()[name = tensor<string, []>("op_5448_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5448_dilations_0 = const()[name = tensor<string, []>("op_5448_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5448_groups_0 = const()[name = tensor<string, []>("op_5448_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_13_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(237535424))), name = tensor<string, []>("layers_13_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82329]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(237370688))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_13_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(326674304))), name = tensor<string, []>("layers_13_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82329]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(326509568))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5448_cast_fp16 = conv(dilations = var_5448_dilations_0, groups = var_5448_groups_0, pad = var_5448_pad_0, pad_type = var_5448_pad_type_0, strides = var_5448_strides_0, weight = layers_13_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_379_cast_fp16)[name = tensor<string, []>("op_5448_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_85_cast_fp16 = add(x = var_5442_cast_fp16, y = var_5448_cast_fp16)[name = tensor<string, []>("x_85_cast_fp16")];
             tensor<fp16, []> var_5450_to_fp16 = const()[name = tensor<string, []>("op_5450_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -3875,16 +3875,16 @@ program(1.0)
             tensor<int32, [1]> out_139_axes_0 = const()[name = tensor<string, []>("out_139_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5461_to_fp16 = const()[name = tensor<string, []>("op_5461_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_139_cast_fp16 = layer_norm(axes = out_139_axes_0, epsilon = var_5461_to_fp16, x = inputs_139_cast_fp16)[name = tensor<string, []>("out_139_cast_fp16")];
-            tensor<fp16, [1024]> inputs_141_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_141_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(238059776)))];
-            tensor<fp16, [1024]> inputs_141_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_141_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(238061888)))];
+            tensor<fp16, [1024]> inputs_141_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_141_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(327198656)))];
+            tensor<fp16, [1024]> inputs_141_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_141_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(327200768)))];
             tensor<fp16, []> inputs_141_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_141_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_141_cast_fp16 = batch_norm(beta = inputs_141_beta_0_to_fp16, epsilon = inputs_141_epsilon_0_to_fp16, gamma = inputs_141_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_139_cast_fp16)[name = tensor<string, []>("inputs_141_cast_fp16")];
             tensor<int32, []> var_5475 = const()[name = tensor<string, []>("op_5475"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_141_axes_0 = const()[name = tensor<string, []>("out_141_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5506_to_fp16 = const()[name = tensor<string, []>("op_5506_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_141_cast_fp16 = layer_norm(axes = out_141_axes_0, epsilon = var_5506_to_fp16, x = inputs_141_cast_fp16)[name = tensor<string, []>("out_141_cast_fp16")];
-            tensor<fp16, [1024]> input_381_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_381_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(238064000)))];
-            tensor<fp16, [1024]> input_381_beta_0_to_fp16 = const()[name = tensor<string, []>("input_381_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(238066112)))];
+            tensor<fp16, [1024]> input_381_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_381_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(327202880)))];
+            tensor<fp16, [1024]> input_381_beta_0_to_fp16 = const()[name = tensor<string, []>("input_381_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(327204992)))];
             tensor<fp16, []> input_381_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_381_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_381_cast_fp16 = batch_norm(beta = input_381_beta_0_to_fp16, epsilon = input_381_epsilon_0_to_fp16, gamma = input_381_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_141_cast_fp16)[name = tensor<string, []>("input_381_cast_fp16")];
             tensor<string, []> var_5526_pad_type_0 = const()[name = tensor<string, []>("op_5526_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3892,14 +3892,14 @@ program(1.0)
             tensor<int32, [4]> var_5526_pad_0 = const()[name = tensor<string, []>("op_5526_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5526_dilations_0 = const()[name = tensor<string, []>("op_5526_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5526_groups_0 = const()[name = tensor<string, []>("op_5526_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_14_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(238068224))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(240165440))), name = tensor<string, []>("layers_14_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_14_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(327207104))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(330352896))), name = tensor<string, []>("layers_14_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5526_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_5526_dilations_0, groups = var_5526_groups_0, pad = var_5526_pad_0, pad_type = var_5526_pad_type_0, strides = var_5526_strides_0, weight = layers_14_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_381_cast_fp16)[name = tensor<string, []>("op_5526_cast_fp16")];
             tensor<string, []> var_5532_pad_type_0 = const()[name = tensor<string, []>("op_5532_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5532_strides_0 = const()[name = tensor<string, []>("op_5532_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5532_pad_0 = const()[name = tensor<string, []>("op_5532_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5532_dilations_0 = const()[name = tensor<string, []>("op_5532_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5532_groups_0 = const()[name = tensor<string, []>("op_5532_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_14_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(240344832))), name = tensor<string, []>("layers_14_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89590]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(240165568))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_14_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(330532352))), name = tensor<string, []>("layers_14_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89590]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(330353088))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5532_cast_fp16 = conv(dilations = var_5532_dilations_0, groups = var_5532_groups_0, pad = var_5532_pad_0, pad_type = var_5532_pad_type_0, strides = var_5532_strides_0, weight = layers_14_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_381_cast_fp16)[name = tensor<string, []>("op_5532_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_383_cast_fp16 = add(x = var_5526_cast_fp16, y = var_5532_cast_fp16)[name = tensor<string, []>("input_383_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_385_cast_fp16 = silu(x = input_383_cast_fp16)[name = tensor<string, []>("input_385_cast_fp16")];
@@ -3908,14 +3908,14 @@ program(1.0)
             tensor<int32, [4]> var_5543_pad_0 = const()[name = tensor<string, []>("op_5543_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5543_dilations_0 = const()[name = tensor<string, []>("op_5543_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5543_groups_0 = const()[name = tensor<string, []>("op_5543_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_14_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(240869184))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(242966400))), name = tensor<string, []>("layers_14_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_14_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(331056704))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(334202496))), name = tensor<string, []>("layers_14_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5543_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5543_dilations_0, groups = var_5543_groups_0, pad = var_5543_pad_0, pad_type = var_5543_pad_type_0, strides = var_5543_strides_0, weight = layers_14_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_385_cast_fp16)[name = tensor<string, []>("op_5543_cast_fp16")];
             tensor<string, []> var_5549_pad_type_0 = const()[name = tensor<string, []>("op_5549_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5549_strides_0 = const()[name = tensor<string, []>("op_5549_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5549_pad_0 = const()[name = tensor<string, []>("op_5549_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5549_dilations_0 = const()[name = tensor<string, []>("op_5549_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5549_groups_0 = const()[name = tensor<string, []>("op_5549_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_14_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(243142272))), name = tensor<string, []>("layers_14_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [87829]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(242966528))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_14_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(334378432))), name = tensor<string, []>("layers_14_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [87829]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(334202688))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5549_cast_fp16 = conv(dilations = var_5549_dilations_0, groups = var_5549_groups_0, pad = var_5549_pad_0, pad_type = var_5549_pad_type_0, strides = var_5549_strides_0, weight = layers_14_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_385_cast_fp16)[name = tensor<string, []>("op_5549_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_87_cast_fp16 = add(x = var_5543_cast_fp16, y = var_5549_cast_fp16)[name = tensor<string, []>("x_87_cast_fp16")];
             tensor<fp16, []> var_5551_to_fp16 = const()[name = tensor<string, []>("op_5551_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -3924,8 +3924,8 @@ program(1.0)
             tensor<int32, [1]> out_143_axes_0 = const()[name = tensor<string, []>("out_143_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5562_to_fp16 = const()[name = tensor<string, []>("op_5562_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_143_cast_fp16 = layer_norm(axes = out_143_axes_0, epsilon = var_5562_to_fp16, x = inputs_143_cast_fp16)[name = tensor<string, []>("out_143_cast_fp16")];
-            tensor<fp16, [1024]> obj_59_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_59_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(243666624)))];
-            tensor<fp16, [1024]> obj_59_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_59_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(243668736)))];
+            tensor<fp16, [1024]> obj_59_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_59_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(334902784)))];
+            tensor<fp16, [1024]> obj_59_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_59_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(334904896)))];
             tensor<fp16, []> obj_59_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_59_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_59_cast_fp16 = batch_norm(beta = obj_59_beta_0_to_fp16, epsilon = obj_59_epsilon_0_to_fp16, gamma = obj_59_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_143_cast_fp16)[name = tensor<string, []>("obj_59_cast_fp16")];
             tensor<string, []> var_5587_pad_type_0 = const()[name = tensor<string, []>("op_5587_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3933,14 +3933,14 @@ program(1.0)
             tensor<int32, [4]> var_5587_pad_0 = const()[name = tensor<string, []>("op_5587_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5587_dilations_0 = const()[name = tensor<string, []>("op_5587_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5587_groups_0 = const()[name = tensor<string, []>("op_5587_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(243670848))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244195200))), name = tensor<string, []>("layers_14_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(334907008))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(335693504))), name = tensor<string, []>("layers_14_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5587_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5587_dilations_0, groups = var_5587_groups_0, pad = var_5587_pad_0, pad_type = var_5587_pad_type_0, strides = var_5587_strides_0, weight = layers_14_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_59_cast_fp16)[name = tensor<string, []>("op_5587_cast_fp16")];
             tensor<string, []> var_5593_pad_type_0 = const()[name = tensor<string, []>("op_5593_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5593_strides_0 = const()[name = tensor<string, []>("op_5593_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5593_pad_0 = const()[name = tensor<string, []>("op_5593_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5593_dilations_0 = const()[name = tensor<string, []>("op_5593_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5593_groups_0 = const()[name = tensor<string, []>("op_5593_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244228928))), name = tensor<string, []>("layers_14_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16746]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244195328))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(335727296))), name = tensor<string, []>("layers_14_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16746]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(335693696))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5593_cast_fp16 = conv(dilations = var_5593_dilations_0, groups = var_5593_groups_0, pad = var_5593_pad_0, pad_type = var_5593_pad_type_0, strides = var_5593_strides_0, weight = layers_14_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_59_cast_fp16)[name = tensor<string, []>("op_5593_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_57_cast_fp16 = add(x = var_5587_cast_fp16, y = var_5593_cast_fp16)[name = tensor<string, []>("query_57_cast_fp16")];
             tensor<string, []> var_5602_pad_type_0 = const()[name = tensor<string, []>("op_5602_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3948,14 +3948,14 @@ program(1.0)
             tensor<int32, [4]> var_5602_pad_0 = const()[name = tensor<string, []>("op_5602_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5602_dilations_0 = const()[name = tensor<string, []>("op_5602_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5602_groups_0 = const()[name = tensor<string, []>("op_5602_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244360064))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244884416))), name = tensor<string, []>("layers_14_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(335858432))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(336644928))), name = tensor<string, []>("layers_14_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5602_cast_fp16 = conv(dilations = var_5602_dilations_0, groups = var_5602_groups_0, pad = var_5602_pad_0, pad_type = var_5602_pad_type_0, strides = var_5602_strides_0, weight = layers_14_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_59_cast_fp16)[name = tensor<string, []>("op_5602_cast_fp16")];
             tensor<string, []> var_5608_pad_type_0 = const()[name = tensor<string, []>("op_5608_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5608_strides_0 = const()[name = tensor<string, []>("op_5608_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5608_pad_0 = const()[name = tensor<string, []>("op_5608_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5608_dilations_0 = const()[name = tensor<string, []>("op_5608_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5608_groups_0 = const()[name = tensor<string, []>("op_5608_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244923392))), name = tensor<string, []>("layers_14_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19386]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(244884544))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(336683968))), name = tensor<string, []>("layers_14_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19386]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(336645120))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5608_cast_fp16 = conv(dilations = var_5608_dilations_0, groups = var_5608_groups_0, pad = var_5608_pad_0, pad_type = var_5608_pad_type_0, strides = var_5608_strides_0, weight = layers_14_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_59_cast_fp16)[name = tensor<string, []>("op_5608_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_29_cast_fp16 = add(x = var_5602_cast_fp16, y = var_5608_cast_fp16)[name = tensor<string, []>("key_29_cast_fp16")];
             tensor<string, []> var_5618_pad_type_0 = const()[name = tensor<string, []>("op_5618_pad_type_0"), val = tensor<string, []>("valid")];
@@ -3963,33 +3963,33 @@ program(1.0)
             tensor<int32, [4]> var_5618_pad_0 = const()[name = tensor<string, []>("op_5618_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5618_dilations_0 = const()[name = tensor<string, []>("op_5618_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5618_groups_0 = const()[name = tensor<string, []>("op_5618_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245054528))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245578880))), name = tensor<string, []>("layers_14_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(336815104))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(337601600))), name = tensor<string, []>("layers_14_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5618_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5618_dilations_0, groups = var_5618_groups_0, pad = var_5618_pad_0, pad_type = var_5618_pad_type_0, strides = var_5618_strides_0, weight = layers_14_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_59_cast_fp16)[name = tensor<string, []>("op_5618_cast_fp16")];
             tensor<string, []> var_5624_pad_type_0 = const()[name = tensor<string, []>("op_5624_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5624_strides_0 = const()[name = tensor<string, []>("op_5624_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5624_pad_0 = const()[name = tensor<string, []>("op_5624_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5624_dilations_0 = const()[name = tensor<string, []>("op_5624_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5624_groups_0 = const()[name = tensor<string, []>("op_5624_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245611776))), name = tensor<string, []>("layers_14_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16343]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245579008))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(337634560))), name = tensor<string, []>("layers_14_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16343]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(337601792))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5624_cast_fp16 = conv(dilations = var_5624_dilations_0, groups = var_5624_groups_0, pad = var_5624_pad_0, pad_type = var_5624_pad_type_0, strides = var_5624_strides_0, weight = layers_14_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_59_cast_fp16)[name = tensor<string, []>("op_5624_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_29_cast_fp16 = add(x = var_5618_cast_fp16, y = var_5624_cast_fp16)[name = tensor<string, []>("value_29_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_5627_to_fp16 = const()[name = tensor<string, []>("op_5627_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245742912)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_5627_to_fp16 = const()[name = tensor<string, []>("op_5627_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(337765696)))];
             tensor<fp16, [1, 1024, 1, 188]> query_59_cast_fp16 = add(x = query_57_cast_fp16, y = var_5627_to_fp16)[name = tensor<string, []>("query_59_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_5630_to_fp16 = const()[name = tensor<string, []>("op_5630_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245745024)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_5630_to_fp16 = const()[name = tensor<string, []>("op_5630_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(337767808)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_29_cast_fp16 = add(x = query_57_cast_fp16, y = var_5630_to_fp16)[name = tensor<string, []>("q_with_bias_v_29_cast_fp16")];
             tensor<string, []> var_5640_pad_type_0 = const()[name = tensor<string, []>("op_5640_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5640_strides_0 = const()[name = tensor<string, []>("op_5640_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5640_pad_0 = const()[name = tensor<string, []>("op_5640_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5640_dilations_0 = const()[name = tensor<string, []>("op_5640_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5640_groups_0 = const()[name = tensor<string, []>("op_5640_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(245747136))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246271488))), name = tensor<string, []>("layers_14_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(337769920))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(338556416))), name = tensor<string, []>("layers_14_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_5640_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5640_dilations_0, groups = var_5640_groups_0, pad = var_5640_pad_0, pad_type = var_5640_pad_type_0, strides = var_5640_strides_0, weight = layers_14_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_5640_cast_fp16")];
             tensor<string, []> var_5646_pad_type_0 = const()[name = tensor<string, []>("op_5646_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5646_strides_0 = const()[name = tensor<string, []>("op_5646_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5646_pad_0 = const()[name = tensor<string, []>("op_5646_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5646_dilations_0 = const()[name = tensor<string, []>("op_5646_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5646_groups_0 = const()[name = tensor<string, []>("op_5646_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246336832))), name = tensor<string, []>("layers_14_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32557]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246271616))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(338621824))), name = tensor<string, []>("layers_14_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32557]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(338556608))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_5646_cast_fp16 = conv(dilations = var_5646_dilations_0, groups = var_5646_groups_0, pad = var_5646_pad_0, pad_type = var_5646_pad_type_0, strides = var_5646_strides_0, weight = layers_14_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_5646_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_29_cast_fp16 = add(x = var_5640_cast_fp16, y = var_5646_cast_fp16)[name = tensor<string, []>("p_29_cast_fp16")];
             tensor<int32, [4]> var_5650 = const()[name = tensor<string, []>("op_5650"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -4040,22 +4040,22 @@ program(1.0)
             tensor<int32, [4]> var_5703_pad_0 = const()[name = tensor<string, []>("op_5703_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5703_dilations_0 = const()[name = tensor<string, []>("op_5703_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5703_groups_0 = const()[name = tensor<string, []>("op_5703_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246467968))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246992320))), name = tensor<string, []>("layers_14_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(338752960))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(339539456))), name = tensor<string, []>("layers_14_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5703_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5703_dilations_0, groups = var_5703_groups_0, pad = var_5703_pad_0, pad_type = var_5703_pad_type_0, strides = var_5703_strides_0, weight = layers_14_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_387_cast_fp16)[name = tensor<string, []>("op_5703_cast_fp16")];
             tensor<string, []> var_5709_pad_type_0 = const()[name = tensor<string, []>("op_5709_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5709_strides_0 = const()[name = tensor<string, []>("op_5709_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5709_pad_0 = const()[name = tensor<string, []>("op_5709_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5709_dilations_0 = const()[name = tensor<string, []>("op_5709_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5709_groups_0 = const()[name = tensor<string, []>("op_5709_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247025920))), name = tensor<string, []>("layers_14_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16687]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(246992448))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(339573120))), name = tensor<string, []>("layers_14_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16687]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(339539648))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5709_cast_fp16 = conv(dilations = var_5709_dilations_0, groups = var_5709_groups_0, pad = var_5709_pad_0, pad_type = var_5709_pad_type_0, strides = var_5709_strides_0, weight = layers_14_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_387_cast_fp16)[name = tensor<string, []>("op_5709_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_61_cast_fp16 = add(x = var_5703_cast_fp16, y = var_5709_cast_fp16)[name = tensor<string, []>("obj_61_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_145_cast_fp16 = add(x = inputs_143_cast_fp16, y = obj_61_cast_fp16)[name = tensor<string, []>("inputs_145_cast_fp16")];
             tensor<int32, [1]> out_145_axes_0 = const()[name = tensor<string, []>("out_145_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5720_to_fp16 = const()[name = tensor<string, []>("op_5720_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_145_cast_fp16 = layer_norm(axes = out_145_axes_0, epsilon = var_5720_to_fp16, x = inputs_145_cast_fp16)[name = tensor<string, []>("out_145_cast_fp16")];
-            tensor<fp16, [1024]> input_389_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_389_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247157056)))];
-            tensor<fp16, [1024]> input_389_beta_0_to_fp16 = const()[name = tensor<string, []>("input_389_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247159168)))];
+            tensor<fp16, [1024]> input_389_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_389_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(339704256)))];
+            tensor<fp16, [1024]> input_389_beta_0_to_fp16 = const()[name = tensor<string, []>("input_389_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(339706368)))];
             tensor<fp16, []> input_389_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_389_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_389_cast_fp16 = batch_norm(beta = input_389_beta_0_to_fp16, epsilon = input_389_epsilon_0_to_fp16, gamma = input_389_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_145_cast_fp16)[name = tensor<string, []>("input_389_cast_fp16")];
             tensor<string, []> var_5741_pad_type_0 = const()[name = tensor<string, []>("op_5741_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4063,14 +4063,14 @@ program(1.0)
             tensor<int32, [4]> var_5741_pad_0 = const()[name = tensor<string, []>("op_5741_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5741_dilations_0 = const()[name = tensor<string, []>("op_5741_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5741_groups_0 = const()[name = tensor<string, []>("op_5741_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_14_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(247161280))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(248209920))), name = tensor<string, []>("layers_14_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_14_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(339708480))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(341281408))), name = tensor<string, []>("layers_14_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_5741_cast_fp16 = conv(dilations = var_5741_dilations_0, groups = var_5741_groups_0, pad = var_5741_pad_0, pad_type = var_5741_pad_type_0, strides = var_5741_strides_0, weight = layers_14_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_389_cast_fp16)[name = tensor<string, []>("op_5741_cast_fp16")];
             tensor<string, []> var_5747_pad_type_0 = const()[name = tensor<string, []>("op_5747_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5747_strides_0 = const()[name = tensor<string, []>("op_5747_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5747_pad_0 = const()[name = tensor<string, []>("op_5747_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5747_dilations_0 = const()[name = tensor<string, []>("op_5747_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5747_groups_0 = const()[name = tensor<string, []>("op_5747_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_14_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(248272960))), name = tensor<string, []>("layers_14_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31416]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(248210048))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_14_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(341344512))), name = tensor<string, []>("layers_14_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31416]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(341281600))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_5747_cast_fp16 = conv(dilations = var_5747_dilations_0, groups = var_5747_groups_0, pad = var_5747_pad_0, pad_type = var_5747_pad_type_0, strides = var_5747_strides_0, weight = layers_14_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_389_cast_fp16)[name = tensor<string, []>("op_5747_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_391_cast_fp16 = add(x = var_5741_cast_fp16, y = var_5747_cast_fp16)[name = tensor<string, []>("input_391_cast_fp16")];
             tensor<int32, []> input_393_split_num_splits_0 = const()[name = tensor<string, []>("input_393_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -4083,8 +4083,8 @@ program(1.0)
             tensor<int32, []> input_395_groups_0 = const()[name = tensor<string, []>("input_395_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_395_strides_0 = const()[name = tensor<string, []>("input_395_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_395_dilations_0 = const()[name = tensor<string, []>("input_395_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_296_to_fp16 = const()[name = tensor<string, []>("const_296_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(248535168)))];
-            tensor<fp16, [1024]> const_297_to_fp16 = const()[name = tensor<string, []>("const_297_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(248553664)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_296_to_fp16 = const()[name = tensor<string, []>("const_296_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(341606720)))];
+            tensor<fp16, [1024]> const_297_to_fp16 = const()[name = tensor<string, []>("const_297_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(341625216)))];
             tensor<fp16, [1, 1024, 1, 188]> input_397_cast_fp16 = conv(bias = const_297_to_fp16, dilations = input_395_dilations_0, groups = input_395_groups_0, pad = input_395_pad_0, pad_type = input_395_pad_type_0, strides = input_395_strides_0, weight = const_296_to_fp16, x = input_393_cast_fp16)[name = tensor<string, []>("input_397_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_399_cast_fp16 = silu(x = input_397_cast_fp16)[name = tensor<string, []>("input_399_cast_fp16")];
             tensor<string, []> var_5769_pad_type_0 = const()[name = tensor<string, []>("op_5769_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4092,22 +4092,22 @@ program(1.0)
             tensor<int32, [4]> var_5769_pad_0 = const()[name = tensor<string, []>("op_5769_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5769_dilations_0 = const()[name = tensor<string, []>("op_5769_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5769_groups_0 = const()[name = tensor<string, []>("op_5769_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(248555776))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249080128))), name = tensor<string, []>("layers_14_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(341627328))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(342413824))), name = tensor<string, []>("layers_14_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5769_cast_fp16 = conv(dilations = var_5769_dilations_0, groups = var_5769_groups_0, pad = var_5769_pad_0, pad_type = var_5769_pad_type_0, strides = var_5769_strides_0, weight = layers_14_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_399_cast_fp16)[name = tensor<string, []>("op_5769_cast_fp16")];
             tensor<string, []> var_5775_pad_type_0 = const()[name = tensor<string, []>("op_5775_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5775_strides_0 = const()[name = tensor<string, []>("op_5775_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5775_pad_0 = const()[name = tensor<string, []>("op_5775_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5775_dilations_0 = const()[name = tensor<string, []>("op_5775_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5775_groups_0 = const()[name = tensor<string, []>("op_5775_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_14_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249110528))), name = tensor<string, []>("layers_14_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15079]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249080256))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_14_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(342444288))), name = tensor<string, []>("layers_14_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15079]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(342414016))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5775_cast_fp16 = conv(dilations = var_5775_dilations_0, groups = var_5775_groups_0, pad = var_5775_pad_0, pad_type = var_5775_pad_type_0, strides = var_5775_strides_0, weight = layers_14_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_399_cast_fp16)[name = tensor<string, []>("op_5775_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_89_cast_fp16 = add(x = var_5769_cast_fp16, y = var_5775_cast_fp16)[name = tensor<string, []>("x_89_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_147_cast_fp16 = add(x = inputs_145_cast_fp16, y = x_89_cast_fp16)[name = tensor<string, []>("inputs_147_cast_fp16")];
             tensor<int32, [1]> out_147_axes_0 = const()[name = tensor<string, []>("out_147_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5786_to_fp16 = const()[name = tensor<string, []>("op_5786_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_147_cast_fp16 = layer_norm(axes = out_147_axes_0, epsilon = var_5786_to_fp16, x = inputs_147_cast_fp16)[name = tensor<string, []>("out_147_cast_fp16")];
-            tensor<fp16, [1024]> input_401_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_401_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249241664)))];
-            tensor<fp16, [1024]> input_401_beta_0_to_fp16 = const()[name = tensor<string, []>("input_401_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249243776)))];
+            tensor<fp16, [1024]> input_401_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_401_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(342575424)))];
+            tensor<fp16, [1024]> input_401_beta_0_to_fp16 = const()[name = tensor<string, []>("input_401_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(342577536)))];
             tensor<fp16, []> input_401_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_401_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_401_cast_fp16 = batch_norm(beta = input_401_beta_0_to_fp16, epsilon = input_401_epsilon_0_to_fp16, gamma = input_401_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_147_cast_fp16)[name = tensor<string, []>("input_401_cast_fp16")];
             tensor<string, []> var_5806_pad_type_0 = const()[name = tensor<string, []>("op_5806_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4115,14 +4115,14 @@ program(1.0)
             tensor<int32, [4]> var_5806_pad_0 = const()[name = tensor<string, []>("op_5806_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5806_dilations_0 = const()[name = tensor<string, []>("op_5806_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5806_groups_0 = const()[name = tensor<string, []>("op_5806_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_14_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(249245888))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251343104))), name = tensor<string, []>("layers_14_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_14_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(342579648))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(345725440))), name = tensor<string, []>("layers_14_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5806_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_5806_dilations_0, groups = var_5806_groups_0, pad = var_5806_pad_0, pad_type = var_5806_pad_type_0, strides = var_5806_strides_0, weight = layers_14_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_401_cast_fp16)[name = tensor<string, []>("op_5806_cast_fp16")];
             tensor<string, []> var_5812_pad_type_0 = const()[name = tensor<string, []>("op_5812_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5812_strides_0 = const()[name = tensor<string, []>("op_5812_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5812_pad_0 = const()[name = tensor<string, []>("op_5812_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5812_dilations_0 = const()[name = tensor<string, []>("op_5812_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5812_groups_0 = const()[name = tensor<string, []>("op_5812_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_14_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251494592))), name = tensor<string, []>("layers_14_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75648]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(251343232))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_14_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(345876992))), name = tensor<string, []>("layers_14_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [75648]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(345725632))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5812_cast_fp16 = conv(dilations = var_5812_dilations_0, groups = var_5812_groups_0, pad = var_5812_pad_0, pad_type = var_5812_pad_type_0, strides = var_5812_strides_0, weight = layers_14_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_401_cast_fp16)[name = tensor<string, []>("op_5812_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_403_cast_fp16 = add(x = var_5806_cast_fp16, y = var_5812_cast_fp16)[name = tensor<string, []>("input_403_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_405_cast_fp16 = silu(x = input_403_cast_fp16)[name = tensor<string, []>("input_405_cast_fp16")];
@@ -4131,14 +4131,14 @@ program(1.0)
             tensor<int32, [4]> var_5823_pad_0 = const()[name = tensor<string, []>("op_5823_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5823_dilations_0 = const()[name = tensor<string, []>("op_5823_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5823_groups_0 = const()[name = tensor<string, []>("op_5823_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_14_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(252018944))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(254116160))), name = tensor<string, []>("layers_14_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_14_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(346401344))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(349547136))), name = tensor<string, []>("layers_14_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5823_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5823_dilations_0, groups = var_5823_groups_0, pad = var_5823_pad_0, pad_type = var_5823_pad_type_0, strides = var_5823_strides_0, weight = layers_14_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_405_cast_fp16)[name = tensor<string, []>("op_5823_cast_fp16")];
             tensor<string, []> var_5829_pad_type_0 = const()[name = tensor<string, []>("op_5829_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5829_strides_0 = const()[name = tensor<string, []>("op_5829_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5829_pad_0 = const()[name = tensor<string, []>("op_5829_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5829_dilations_0 = const()[name = tensor<string, []>("op_5829_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5829_groups_0 = const()[name = tensor<string, []>("op_5829_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_14_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(254276288))), name = tensor<string, []>("layers_14_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [79960]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(254116288))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_14_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(349707328))), name = tensor<string, []>("layers_14_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [79960]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(349547328))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5829_cast_fp16 = conv(dilations = var_5829_dilations_0, groups = var_5829_groups_0, pad = var_5829_pad_0, pad_type = var_5829_pad_type_0, strides = var_5829_strides_0, weight = layers_14_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_405_cast_fp16)[name = tensor<string, []>("op_5829_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_91_cast_fp16 = add(x = var_5823_cast_fp16, y = var_5829_cast_fp16)[name = tensor<string, []>("x_91_cast_fp16")];
             tensor<fp16, []> var_5831_to_fp16 = const()[name = tensor<string, []>("op_5831_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -4147,16 +4147,16 @@ program(1.0)
             tensor<int32, [1]> out_149_axes_0 = const()[name = tensor<string, []>("out_149_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5842_to_fp16 = const()[name = tensor<string, []>("op_5842_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_149_cast_fp16 = layer_norm(axes = out_149_axes_0, epsilon = var_5842_to_fp16, x = inputs_149_cast_fp16)[name = tensor<string, []>("out_149_cast_fp16")];
-            tensor<fp16, [1024]> inputs_151_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_151_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(254800640)))];
-            tensor<fp16, [1024]> inputs_151_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_151_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(254802752)))];
+            tensor<fp16, [1024]> inputs_151_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_151_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(350231680)))];
+            tensor<fp16, [1024]> inputs_151_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_151_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(350233792)))];
             tensor<fp16, []> inputs_151_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_151_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_151_cast_fp16 = batch_norm(beta = inputs_151_beta_0_to_fp16, epsilon = inputs_151_epsilon_0_to_fp16, gamma = inputs_151_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_149_cast_fp16)[name = tensor<string, []>("inputs_151_cast_fp16")];
             tensor<int32, []> var_5856 = const()[name = tensor<string, []>("op_5856"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_151_axes_0 = const()[name = tensor<string, []>("out_151_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5887_to_fp16 = const()[name = tensor<string, []>("op_5887_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_151_cast_fp16 = layer_norm(axes = out_151_axes_0, epsilon = var_5887_to_fp16, x = inputs_151_cast_fp16)[name = tensor<string, []>("out_151_cast_fp16")];
-            tensor<fp16, [1024]> input_407_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_407_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(254804864)))];
-            tensor<fp16, [1024]> input_407_beta_0_to_fp16 = const()[name = tensor<string, []>("input_407_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(254806976)))];
+            tensor<fp16, [1024]> input_407_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_407_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(350235904)))];
+            tensor<fp16, [1024]> input_407_beta_0_to_fp16 = const()[name = tensor<string, []>("input_407_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(350238016)))];
             tensor<fp16, []> input_407_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_407_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_407_cast_fp16 = batch_norm(beta = input_407_beta_0_to_fp16, epsilon = input_407_epsilon_0_to_fp16, gamma = input_407_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_151_cast_fp16)[name = tensor<string, []>("input_407_cast_fp16")];
             tensor<string, []> var_5907_pad_type_0 = const()[name = tensor<string, []>("op_5907_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4164,14 +4164,14 @@ program(1.0)
             tensor<int32, [4]> var_5907_pad_0 = const()[name = tensor<string, []>("op_5907_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5907_dilations_0 = const()[name = tensor<string, []>("op_5907_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5907_groups_0 = const()[name = tensor<string, []>("op_5907_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_15_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(254809088))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256906304))), name = tensor<string, []>("layers_15_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_15_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(350240128))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(353385920))), name = tensor<string, []>("layers_15_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5907_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_5907_dilations_0, groups = var_5907_groups_0, pad = var_5907_pad_0, pad_type = var_5907_pad_type_0, strides = var_5907_strides_0, weight = layers_15_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_407_cast_fp16)[name = tensor<string, []>("op_5907_cast_fp16")];
             tensor<string, []> var_5913_pad_type_0 = const()[name = tensor<string, []>("op_5913_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5913_strides_0 = const()[name = tensor<string, []>("op_5913_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5913_pad_0 = const()[name = tensor<string, []>("op_5913_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5913_dilations_0 = const()[name = tensor<string, []>("op_5913_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5913_groups_0 = const()[name = tensor<string, []>("op_5913_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_15_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(257084032))), name = tensor<string, []>("layers_15_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [88763]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(256906432))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_15_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(353563712))), name = tensor<string, []>("layers_15_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [88763]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(353386112))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_5913_cast_fp16 = conv(dilations = var_5913_dilations_0, groups = var_5913_groups_0, pad = var_5913_pad_0, pad_type = var_5913_pad_type_0, strides = var_5913_strides_0, weight = layers_15_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_407_cast_fp16)[name = tensor<string, []>("op_5913_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_409_cast_fp16 = add(x = var_5907_cast_fp16, y = var_5913_cast_fp16)[name = tensor<string, []>("input_409_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_411_cast_fp16 = silu(x = input_409_cast_fp16)[name = tensor<string, []>("input_411_cast_fp16")];
@@ -4180,14 +4180,14 @@ program(1.0)
             tensor<int32, [4]> var_5924_pad_0 = const()[name = tensor<string, []>("op_5924_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5924_dilations_0 = const()[name = tensor<string, []>("op_5924_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5924_groups_0 = const()[name = tensor<string, []>("op_5924_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_15_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(257608384))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(259705600))), name = tensor<string, []>("layers_15_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_15_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(354088064))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(357233856))), name = tensor<string, []>("layers_15_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5924_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5924_dilations_0, groups = var_5924_groups_0, pad = var_5924_pad_0, pad_type = var_5924_pad_type_0, strides = var_5924_strides_0, weight = layers_15_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_411_cast_fp16)[name = tensor<string, []>("op_5924_cast_fp16")];
             tensor<string, []> var_5930_pad_type_0 = const()[name = tensor<string, []>("op_5930_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5930_strides_0 = const()[name = tensor<string, []>("op_5930_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5930_pad_0 = const()[name = tensor<string, []>("op_5930_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5930_dilations_0 = const()[name = tensor<string, []>("op_5930_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5930_groups_0 = const()[name = tensor<string, []>("op_5930_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_15_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(259884736))), name = tensor<string, []>("layers_15_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89445]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(259705728))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_15_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(357413056))), name = tensor<string, []>("layers_15_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [89445]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(357234048))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5930_cast_fp16 = conv(dilations = var_5930_dilations_0, groups = var_5930_groups_0, pad = var_5930_pad_0, pad_type = var_5930_pad_type_0, strides = var_5930_strides_0, weight = layers_15_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_411_cast_fp16)[name = tensor<string, []>("op_5930_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_93_cast_fp16 = add(x = var_5924_cast_fp16, y = var_5930_cast_fp16)[name = tensor<string, []>("x_93_cast_fp16")];
             tensor<fp16, []> var_5932_to_fp16 = const()[name = tensor<string, []>("op_5932_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -4196,8 +4196,8 @@ program(1.0)
             tensor<int32, [1]> out_153_axes_0 = const()[name = tensor<string, []>("out_153_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_5943_to_fp16 = const()[name = tensor<string, []>("op_5943_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_153_cast_fp16 = layer_norm(axes = out_153_axes_0, epsilon = var_5943_to_fp16, x = inputs_153_cast_fp16)[name = tensor<string, []>("out_153_cast_fp16")];
-            tensor<fp16, [1024]> obj_63_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_63_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(260409088)))];
-            tensor<fp16, [1024]> obj_63_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_63_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(260411200)))];
+            tensor<fp16, [1024]> obj_63_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_63_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(357937408)))];
+            tensor<fp16, [1024]> obj_63_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_63_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(357939520)))];
             tensor<fp16, []> obj_63_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_63_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_63_cast_fp16 = batch_norm(beta = obj_63_beta_0_to_fp16, epsilon = obj_63_epsilon_0_to_fp16, gamma = obj_63_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_153_cast_fp16)[name = tensor<string, []>("obj_63_cast_fp16")];
             tensor<string, []> var_5968_pad_type_0 = const()[name = tensor<string, []>("op_5968_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4205,14 +4205,14 @@ program(1.0)
             tensor<int32, [4]> var_5968_pad_0 = const()[name = tensor<string, []>("op_5968_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5968_dilations_0 = const()[name = tensor<string, []>("op_5968_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5968_groups_0 = const()[name = tensor<string, []>("op_5968_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(260413312))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(260937664))), name = tensor<string, []>("layers_15_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(357941632))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(358728128))), name = tensor<string, []>("layers_15_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5968_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5968_dilations_0, groups = var_5968_groups_0, pad = var_5968_pad_0, pad_type = var_5968_pad_type_0, strides = var_5968_strides_0, weight = layers_15_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_63_cast_fp16)[name = tensor<string, []>("op_5968_cast_fp16")];
             tensor<string, []> var_5974_pad_type_0 = const()[name = tensor<string, []>("op_5974_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5974_strides_0 = const()[name = tensor<string, []>("op_5974_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5974_pad_0 = const()[name = tensor<string, []>("op_5974_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5974_dilations_0 = const()[name = tensor<string, []>("op_5974_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5974_groups_0 = const()[name = tensor<string, []>("op_5974_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(260968064))), name = tensor<string, []>("layers_15_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15094]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(260937792))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(358758592))), name = tensor<string, []>("layers_15_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15094]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(358728320))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5974_cast_fp16 = conv(dilations = var_5974_dilations_0, groups = var_5974_groups_0, pad = var_5974_pad_0, pad_type = var_5974_pad_type_0, strides = var_5974_strides_0, weight = layers_15_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_63_cast_fp16)[name = tensor<string, []>("op_5974_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_61_cast_fp16 = add(x = var_5968_cast_fp16, y = var_5974_cast_fp16)[name = tensor<string, []>("query_61_cast_fp16")];
             tensor<string, []> var_5983_pad_type_0 = const()[name = tensor<string, []>("op_5983_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4220,14 +4220,14 @@ program(1.0)
             tensor<int32, [4]> var_5983_pad_0 = const()[name = tensor<string, []>("op_5983_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5983_dilations_0 = const()[name = tensor<string, []>("op_5983_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5983_groups_0 = const()[name = tensor<string, []>("op_5983_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(261099200))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(261623552))), name = tensor<string, []>("layers_15_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(358889728))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(359676224))), name = tensor<string, []>("layers_15_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5983_cast_fp16 = conv(dilations = var_5983_dilations_0, groups = var_5983_groups_0, pad = var_5983_pad_0, pad_type = var_5983_pad_type_0, strides = var_5983_strides_0, weight = layers_15_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_63_cast_fp16)[name = tensor<string, []>("op_5983_cast_fp16")];
             tensor<string, []> var_5989_pad_type_0 = const()[name = tensor<string, []>("op_5989_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_5989_strides_0 = const()[name = tensor<string, []>("op_5989_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_5989_pad_0 = const()[name = tensor<string, []>("op_5989_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5989_dilations_0 = const()[name = tensor<string, []>("op_5989_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5989_groups_0 = const()[name = tensor<string, []>("op_5989_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(261660480))), name = tensor<string, []>("layers_15_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18341]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(261623680))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(359713216))), name = tensor<string, []>("layers_15_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18341]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(359676416))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5989_cast_fp16 = conv(dilations = var_5989_dilations_0, groups = var_5989_groups_0, pad = var_5989_pad_0, pad_type = var_5989_pad_type_0, strides = var_5989_strides_0, weight = layers_15_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_63_cast_fp16)[name = tensor<string, []>("op_5989_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_31_cast_fp16 = add(x = var_5983_cast_fp16, y = var_5989_cast_fp16)[name = tensor<string, []>("key_31_cast_fp16")];
             tensor<string, []> var_5999_pad_type_0 = const()[name = tensor<string, []>("op_5999_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4235,33 +4235,33 @@ program(1.0)
             tensor<int32, [4]> var_5999_pad_0 = const()[name = tensor<string, []>("op_5999_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_5999_dilations_0 = const()[name = tensor<string, []>("op_5999_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_5999_groups_0 = const()[name = tensor<string, []>("op_5999_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(261791616))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(262315968))), name = tensor<string, []>("layers_15_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(359844352))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360630848))), name = tensor<string, []>("layers_15_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_5999_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_5999_dilations_0, groups = var_5999_groups_0, pad = var_5999_pad_0, pad_type = var_5999_pad_type_0, strides = var_5999_strides_0, weight = layers_15_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_63_cast_fp16)[name = tensor<string, []>("op_5999_cast_fp16")];
             tensor<string, []> var_6005_pad_type_0 = const()[name = tensor<string, []>("op_6005_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6005_strides_0 = const()[name = tensor<string, []>("op_6005_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6005_pad_0 = const()[name = tensor<string, []>("op_6005_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6005_dilations_0 = const()[name = tensor<string, []>("op_6005_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6005_groups_0 = const()[name = tensor<string, []>("op_6005_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(262346880))), name = tensor<string, []>("layers_15_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15333]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(262316096))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360661824))), name = tensor<string, []>("layers_15_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15333]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360631040))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6005_cast_fp16 = conv(dilations = var_6005_dilations_0, groups = var_6005_groups_0, pad = var_6005_pad_0, pad_type = var_6005_pad_type_0, strides = var_6005_strides_0, weight = layers_15_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_63_cast_fp16)[name = tensor<string, []>("op_6005_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_31_cast_fp16 = add(x = var_5999_cast_fp16, y = var_6005_cast_fp16)[name = tensor<string, []>("value_31_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_6008_to_fp16 = const()[name = tensor<string, []>("op_6008_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(262478016)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_6008_to_fp16 = const()[name = tensor<string, []>("op_6008_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360792960)))];
             tensor<fp16, [1, 1024, 1, 188]> query_63_cast_fp16 = add(x = query_61_cast_fp16, y = var_6008_to_fp16)[name = tensor<string, []>("query_63_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_6011_to_fp16 = const()[name = tensor<string, []>("op_6011_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(262480128)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_6011_to_fp16 = const()[name = tensor<string, []>("op_6011_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360795072)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_31_cast_fp16 = add(x = query_61_cast_fp16, y = var_6011_to_fp16)[name = tensor<string, []>("q_with_bias_v_31_cast_fp16")];
             tensor<string, []> var_6021_pad_type_0 = const()[name = tensor<string, []>("op_6021_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6021_strides_0 = const()[name = tensor<string, []>("op_6021_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6021_pad_0 = const()[name = tensor<string, []>("op_6021_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6021_dilations_0 = const()[name = tensor<string, []>("op_6021_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6021_groups_0 = const()[name = tensor<string, []>("op_6021_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(262482240))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263006592))), name = tensor<string, []>("layers_15_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360797184))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(361583680))), name = tensor<string, []>("layers_15_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_6021_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6021_dilations_0, groups = var_6021_groups_0, pad = var_6021_pad_0, pad_type = var_6021_pad_type_0, strides = var_6021_strides_0, weight = layers_15_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_6021_cast_fp16")];
             tensor<string, []> var_6027_pad_type_0 = const()[name = tensor<string, []>("op_6027_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6027_strides_0 = const()[name = tensor<string, []>("op_6027_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6027_pad_0 = const()[name = tensor<string, []>("op_6027_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6027_dilations_0 = const()[name = tensor<string, []>("op_6027_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6027_groups_0 = const()[name = tensor<string, []>("op_6027_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263063296))), name = tensor<string, []>("layers_15_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [28233]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263006720))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(361640448))), name = tensor<string, []>("layers_15_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [28233]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(361583872))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_6027_cast_fp16 = conv(dilations = var_6027_dilations_0, groups = var_6027_groups_0, pad = var_6027_pad_0, pad_type = var_6027_pad_type_0, strides = var_6027_strides_0, weight = layers_15_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_6027_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_31_cast_fp16 = add(x = var_6021_cast_fp16, y = var_6027_cast_fp16)[name = tensor<string, []>("p_31_cast_fp16")];
             tensor<int32, [4]> var_6031 = const()[name = tensor<string, []>("op_6031"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -4312,22 +4312,22 @@ program(1.0)
             tensor<int32, [4]> var_6084_pad_0 = const()[name = tensor<string, []>("op_6084_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6084_dilations_0 = const()[name = tensor<string, []>("op_6084_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6084_groups_0 = const()[name = tensor<string, []>("op_6084_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263194432))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263718784))), name = tensor<string, []>("layers_15_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(361771584))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362558080))), name = tensor<string, []>("layers_15_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6084_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6084_dilations_0, groups = var_6084_groups_0, pad = var_6084_pad_0, pad_type = var_6084_pad_type_0, strides = var_6084_strides_0, weight = layers_15_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_413_cast_fp16)[name = tensor<string, []>("op_6084_cast_fp16")];
             tensor<string, []> var_6090_pad_type_0 = const()[name = tensor<string, []>("op_6090_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6090_strides_0 = const()[name = tensor<string, []>("op_6090_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6090_pad_0 = const()[name = tensor<string, []>("op_6090_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6090_dilations_0 = const()[name = tensor<string, []>("op_6090_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6090_groups_0 = const()[name = tensor<string, []>("op_6090_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263749504))), name = tensor<string, []>("layers_15_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15237]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263718912))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362588864))), name = tensor<string, []>("layers_15_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15237]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362558272))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6090_cast_fp16 = conv(dilations = var_6090_dilations_0, groups = var_6090_groups_0, pad = var_6090_pad_0, pad_type = var_6090_pad_type_0, strides = var_6090_strides_0, weight = layers_15_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_413_cast_fp16)[name = tensor<string, []>("op_6090_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_65_cast_fp16 = add(x = var_6084_cast_fp16, y = var_6090_cast_fp16)[name = tensor<string, []>("obj_65_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_155_cast_fp16 = add(x = inputs_153_cast_fp16, y = obj_65_cast_fp16)[name = tensor<string, []>("inputs_155_cast_fp16")];
             tensor<int32, [1]> out_155_axes_0 = const()[name = tensor<string, []>("out_155_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6101_to_fp16 = const()[name = tensor<string, []>("op_6101_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_155_cast_fp16 = layer_norm(axes = out_155_axes_0, epsilon = var_6101_to_fp16, x = inputs_155_cast_fp16)[name = tensor<string, []>("out_155_cast_fp16")];
-            tensor<fp16, [1024]> input_415_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_415_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263880640)))];
-            tensor<fp16, [1024]> input_415_beta_0_to_fp16 = const()[name = tensor<string, []>("input_415_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263882752)))];
+            tensor<fp16, [1024]> input_415_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_415_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362720000)))];
+            tensor<fp16, [1024]> input_415_beta_0_to_fp16 = const()[name = tensor<string, []>("input_415_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362722112)))];
             tensor<fp16, []> input_415_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_415_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_415_cast_fp16 = batch_norm(beta = input_415_beta_0_to_fp16, epsilon = input_415_epsilon_0_to_fp16, gamma = input_415_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_155_cast_fp16)[name = tensor<string, []>("input_415_cast_fp16")];
             tensor<string, []> var_6122_pad_type_0 = const()[name = tensor<string, []>("op_6122_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4335,14 +4335,14 @@ program(1.0)
             tensor<int32, [4]> var_6122_pad_0 = const()[name = tensor<string, []>("op_6122_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6122_dilations_0 = const()[name = tensor<string, []>("op_6122_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6122_groups_0 = const()[name = tensor<string, []>("op_6122_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_15_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263884864))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(264933504))), name = tensor<string, []>("layers_15_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_15_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362724224))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(364297152))), name = tensor<string, []>("layers_15_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_6122_cast_fp16 = conv(dilations = var_6122_dilations_0, groups = var_6122_groups_0, pad = var_6122_pad_0, pad_type = var_6122_pad_type_0, strides = var_6122_strides_0, weight = layers_15_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_415_cast_fp16)[name = tensor<string, []>("op_6122_cast_fp16")];
             tensor<string, []> var_6128_pad_type_0 = const()[name = tensor<string, []>("op_6128_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6128_strides_0 = const()[name = tensor<string, []>("op_6128_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6128_pad_0 = const()[name = tensor<string, []>("op_6128_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6128_dilations_0 = const()[name = tensor<string, []>("op_6128_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6128_groups_0 = const()[name = tensor<string, []>("op_6128_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_15_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(264996416))), name = tensor<string, []>("layers_15_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31342]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(264933632))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_15_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(364360128))), name = tensor<string, []>("layers_15_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31342]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(364297344))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_6128_cast_fp16 = conv(dilations = var_6128_dilations_0, groups = var_6128_groups_0, pad = var_6128_pad_0, pad_type = var_6128_pad_type_0, strides = var_6128_strides_0, weight = layers_15_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_415_cast_fp16)[name = tensor<string, []>("op_6128_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_417_cast_fp16 = add(x = var_6122_cast_fp16, y = var_6128_cast_fp16)[name = tensor<string, []>("input_417_cast_fp16")];
             tensor<int32, []> input_419_split_num_splits_0 = const()[name = tensor<string, []>("input_419_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -4355,8 +4355,8 @@ program(1.0)
             tensor<int32, []> input_421_groups_0 = const()[name = tensor<string, []>("input_421_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_421_strides_0 = const()[name = tensor<string, []>("input_421_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_421_dilations_0 = const()[name = tensor<string, []>("input_421_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_298_to_fp16 = const()[name = tensor<string, []>("const_298_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265258624)))];
-            tensor<fp16, [1024]> const_299_to_fp16 = const()[name = tensor<string, []>("const_299_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265277120)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_298_to_fp16 = const()[name = tensor<string, []>("const_298_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(364622336)))];
+            tensor<fp16, [1024]> const_299_to_fp16 = const()[name = tensor<string, []>("const_299_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(364640832)))];
             tensor<fp16, [1, 1024, 1, 188]> input_423_cast_fp16 = conv(bias = const_299_to_fp16, dilations = input_421_dilations_0, groups = input_421_groups_0, pad = input_421_pad_0, pad_type = input_421_pad_type_0, strides = input_421_strides_0, weight = const_298_to_fp16, x = input_419_cast_fp16)[name = tensor<string, []>("input_423_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_425_cast_fp16 = silu(x = input_423_cast_fp16)[name = tensor<string, []>("input_425_cast_fp16")];
             tensor<string, []> var_6150_pad_type_0 = const()[name = tensor<string, []>("op_6150_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4364,22 +4364,22 @@ program(1.0)
             tensor<int32, [4]> var_6150_pad_0 = const()[name = tensor<string, []>("op_6150_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6150_dilations_0 = const()[name = tensor<string, []>("op_6150_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6150_groups_0 = const()[name = tensor<string, []>("op_6150_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265279232))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265803584))), name = tensor<string, []>("layers_15_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(364642944))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365429440))), name = tensor<string, []>("layers_15_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6150_cast_fp16 = conv(dilations = var_6150_dilations_0, groups = var_6150_groups_0, pad = var_6150_pad_0, pad_type = var_6150_pad_type_0, strides = var_6150_strides_0, weight = layers_15_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_425_cast_fp16)[name = tensor<string, []>("op_6150_cast_fp16")];
             tensor<string, []> var_6156_pad_type_0 = const()[name = tensor<string, []>("op_6156_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6156_strides_0 = const()[name = tensor<string, []>("op_6156_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6156_pad_0 = const()[name = tensor<string, []>("op_6156_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6156_dilations_0 = const()[name = tensor<string, []>("op_6156_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6156_groups_0 = const()[name = tensor<string, []>("op_6156_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_15_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265834816))), name = tensor<string, []>("layers_15_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15517]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265803712))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_15_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365460736))), name = tensor<string, []>("layers_15_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15517]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365429632))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6156_cast_fp16 = conv(dilations = var_6156_dilations_0, groups = var_6156_groups_0, pad = var_6156_pad_0, pad_type = var_6156_pad_type_0, strides = var_6156_strides_0, weight = layers_15_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_425_cast_fp16)[name = tensor<string, []>("op_6156_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_95_cast_fp16 = add(x = var_6150_cast_fp16, y = var_6156_cast_fp16)[name = tensor<string, []>("x_95_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_157_cast_fp16 = add(x = inputs_155_cast_fp16, y = x_95_cast_fp16)[name = tensor<string, []>("inputs_157_cast_fp16")];
             tensor<int32, [1]> out_157_axes_0 = const()[name = tensor<string, []>("out_157_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6167_to_fp16 = const()[name = tensor<string, []>("op_6167_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_157_cast_fp16 = layer_norm(axes = out_157_axes_0, epsilon = var_6167_to_fp16, x = inputs_157_cast_fp16)[name = tensor<string, []>("out_157_cast_fp16")];
-            tensor<fp16, [1024]> input_427_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_427_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265965952)))];
-            tensor<fp16, [1024]> input_427_beta_0_to_fp16 = const()[name = tensor<string, []>("input_427_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265968064)))];
+            tensor<fp16, [1024]> input_427_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_427_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365591872)))];
+            tensor<fp16, [1024]> input_427_beta_0_to_fp16 = const()[name = tensor<string, []>("input_427_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365593984)))];
             tensor<fp16, []> input_427_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_427_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_427_cast_fp16 = batch_norm(beta = input_427_beta_0_to_fp16, epsilon = input_427_epsilon_0_to_fp16, gamma = input_427_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_157_cast_fp16)[name = tensor<string, []>("input_427_cast_fp16")];
             tensor<string, []> var_6187_pad_type_0 = const()[name = tensor<string, []>("op_6187_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4387,14 +4387,14 @@ program(1.0)
             tensor<int32, [4]> var_6187_pad_0 = const()[name = tensor<string, []>("op_6187_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6187_dilations_0 = const()[name = tensor<string, []>("op_6187_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6187_groups_0 = const()[name = tensor<string, []>("op_6187_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_15_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(265970176))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268067392))), name = tensor<string, []>("layers_15_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_15_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365596096))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(368741888))), name = tensor<string, []>("layers_15_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6187_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_6187_dilations_0, groups = var_6187_groups_0, pad = var_6187_pad_0, pad_type = var_6187_pad_type_0, strides = var_6187_strides_0, weight = layers_15_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_427_cast_fp16)[name = tensor<string, []>("op_6187_cast_fp16")];
             tensor<string, []> var_6193_pad_type_0 = const()[name = tensor<string, []>("op_6193_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6193_strides_0 = const()[name = tensor<string, []>("op_6193_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6193_pad_0 = const()[name = tensor<string, []>("op_6193_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6193_dilations_0 = const()[name = tensor<string, []>("op_6193_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6193_groups_0 = const()[name = tensor<string, []>("op_6193_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_15_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268222912))), name = tensor<string, []>("layers_15_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [77663]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268067520))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_15_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(368897472))), name = tensor<string, []>("layers_15_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [77663]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(368742080))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6193_cast_fp16 = conv(dilations = var_6193_dilations_0, groups = var_6193_groups_0, pad = var_6193_pad_0, pad_type = var_6193_pad_type_0, strides = var_6193_strides_0, weight = layers_15_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_427_cast_fp16)[name = tensor<string, []>("op_6193_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_429_cast_fp16 = add(x = var_6187_cast_fp16, y = var_6193_cast_fp16)[name = tensor<string, []>("input_429_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_431_cast_fp16 = silu(x = input_429_cast_fp16)[name = tensor<string, []>("input_431_cast_fp16")];
@@ -4403,14 +4403,14 @@ program(1.0)
             tensor<int32, [4]> var_6204_pad_0 = const()[name = tensor<string, []>("op_6204_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6204_dilations_0 = const()[name = tensor<string, []>("op_6204_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6204_groups_0 = const()[name = tensor<string, []>("op_6204_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_15_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(268747264))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(270844480))), name = tensor<string, []>("layers_15_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_15_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(369421824))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(372567616))), name = tensor<string, []>("layers_15_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6204_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6204_dilations_0, groups = var_6204_groups_0, pad = var_6204_pad_0, pad_type = var_6204_pad_type_0, strides = var_6204_strides_0, weight = layers_15_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_431_cast_fp16)[name = tensor<string, []>("op_6204_cast_fp16")];
             tensor<string, []> var_6210_pad_type_0 = const()[name = tensor<string, []>("op_6210_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6210_strides_0 = const()[name = tensor<string, []>("op_6210_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6210_pad_0 = const()[name = tensor<string, []>("op_6210_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6210_dilations_0 = const()[name = tensor<string, []>("op_6210_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6210_groups_0 = const()[name = tensor<string, []>("op_6210_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_15_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(271008448))), name = tensor<string, []>("layers_15_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81869]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(270844608))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_15_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(372731648))), name = tensor<string, []>("layers_15_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [81869]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(372567808))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6210_cast_fp16 = conv(dilations = var_6210_dilations_0, groups = var_6210_groups_0, pad = var_6210_pad_0, pad_type = var_6210_pad_type_0, strides = var_6210_strides_0, weight = layers_15_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_431_cast_fp16)[name = tensor<string, []>("op_6210_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_97_cast_fp16 = add(x = var_6204_cast_fp16, y = var_6210_cast_fp16)[name = tensor<string, []>("x_97_cast_fp16")];
             tensor<fp16, []> var_6212_to_fp16 = const()[name = tensor<string, []>("op_6212_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -4419,16 +4419,16 @@ program(1.0)
             tensor<int32, [1]> out_159_axes_0 = const()[name = tensor<string, []>("out_159_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6223_to_fp16 = const()[name = tensor<string, []>("op_6223_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_159_cast_fp16 = layer_norm(axes = out_159_axes_0, epsilon = var_6223_to_fp16, x = inputs_159_cast_fp16)[name = tensor<string, []>("out_159_cast_fp16")];
-            tensor<fp16, [1024]> inputs_161_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_161_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(271532800)))];
-            tensor<fp16, [1024]> inputs_161_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_161_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(271534912)))];
+            tensor<fp16, [1024]> inputs_161_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_161_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(373256000)))];
+            tensor<fp16, [1024]> inputs_161_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_161_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(373258112)))];
             tensor<fp16, []> inputs_161_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_161_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_161_cast_fp16 = batch_norm(beta = inputs_161_beta_0_to_fp16, epsilon = inputs_161_epsilon_0_to_fp16, gamma = inputs_161_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_159_cast_fp16)[name = tensor<string, []>("inputs_161_cast_fp16")];
             tensor<int32, []> var_6237 = const()[name = tensor<string, []>("op_6237"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_161_axes_0 = const()[name = tensor<string, []>("out_161_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6268_to_fp16 = const()[name = tensor<string, []>("op_6268_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_161_cast_fp16 = layer_norm(axes = out_161_axes_0, epsilon = var_6268_to_fp16, x = inputs_161_cast_fp16)[name = tensor<string, []>("out_161_cast_fp16")];
-            tensor<fp16, [1024]> input_433_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_433_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(271537024)))];
-            tensor<fp16, [1024]> input_433_beta_0_to_fp16 = const()[name = tensor<string, []>("input_433_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(271539136)))];
+            tensor<fp16, [1024]> input_433_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_433_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(373260224)))];
+            tensor<fp16, [1024]> input_433_beta_0_to_fp16 = const()[name = tensor<string, []>("input_433_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(373262336)))];
             tensor<fp16, []> input_433_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_433_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_433_cast_fp16 = batch_norm(beta = input_433_beta_0_to_fp16, epsilon = input_433_epsilon_0_to_fp16, gamma = input_433_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_161_cast_fp16)[name = tensor<string, []>("input_433_cast_fp16")];
             tensor<string, []> var_6288_pad_type_0 = const()[name = tensor<string, []>("op_6288_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4436,14 +4436,14 @@ program(1.0)
             tensor<int32, [4]> var_6288_pad_0 = const()[name = tensor<string, []>("op_6288_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6288_dilations_0 = const()[name = tensor<string, []>("op_6288_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6288_groups_0 = const()[name = tensor<string, []>("op_6288_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_16_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(271541248))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(273638464))), name = tensor<string, []>("layers_16_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_16_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(373264448))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(376410240))), name = tensor<string, []>("layers_16_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6288_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_6288_dilations_0, groups = var_6288_groups_0, pad = var_6288_pad_0, pad_type = var_6288_pad_type_0, strides = var_6288_strides_0, weight = layers_16_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_433_cast_fp16)[name = tensor<string, []>("op_6288_cast_fp16")];
             tensor<string, []> var_6294_pad_type_0 = const()[name = tensor<string, []>("op_6294_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6294_strides_0 = const()[name = tensor<string, []>("op_6294_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6294_pad_0 = const()[name = tensor<string, []>("op_6294_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6294_dilations_0 = const()[name = tensor<string, []>("op_6294_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6294_groups_0 = const()[name = tensor<string, []>("op_6294_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_16_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(273811712))), name = tensor<string, []>("layers_16_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [86501]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(273638592))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_16_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(376583552))), name = tensor<string, []>("layers_16_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [86501]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(376410432))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6294_cast_fp16 = conv(dilations = var_6294_dilations_0, groups = var_6294_groups_0, pad = var_6294_pad_0, pad_type = var_6294_pad_type_0, strides = var_6294_strides_0, weight = layers_16_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_433_cast_fp16)[name = tensor<string, []>("op_6294_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_435_cast_fp16 = add(x = var_6288_cast_fp16, y = var_6294_cast_fp16)[name = tensor<string, []>("input_435_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_437_cast_fp16 = silu(x = input_435_cast_fp16)[name = tensor<string, []>("input_437_cast_fp16")];
@@ -4452,14 +4452,14 @@ program(1.0)
             tensor<int32, [4]> var_6305_pad_0 = const()[name = tensor<string, []>("op_6305_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6305_dilations_0 = const()[name = tensor<string, []>("op_6305_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6305_groups_0 = const()[name = tensor<string, []>("op_6305_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_16_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(274336064))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(276433280))), name = tensor<string, []>("layers_16_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_16_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(377107904))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380253696))), name = tensor<string, []>("layers_16_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6305_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6305_dilations_0, groups = var_6305_groups_0, pad = var_6305_pad_0, pad_type = var_6305_pad_type_0, strides = var_6305_strides_0, weight = layers_16_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_437_cast_fp16)[name = tensor<string, []>("op_6305_cast_fp16")];
             tensor<string, []> var_6311_pad_type_0 = const()[name = tensor<string, []>("op_6311_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6311_strides_0 = const()[name = tensor<string, []>("op_6311_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6311_pad_0 = const()[name = tensor<string, []>("op_6311_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6311_dilations_0 = const()[name = tensor<string, []>("op_6311_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6311_groups_0 = const()[name = tensor<string, []>("op_6311_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_16_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(276609856))), name = tensor<string, []>("layers_16_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [88183]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(276433408))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_16_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380430336))), name = tensor<string, []>("layers_16_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [88183]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380253888))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6311_cast_fp16 = conv(dilations = var_6311_dilations_0, groups = var_6311_groups_0, pad = var_6311_pad_0, pad_type = var_6311_pad_type_0, strides = var_6311_strides_0, weight = layers_16_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_437_cast_fp16)[name = tensor<string, []>("op_6311_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_99_cast_fp16 = add(x = var_6305_cast_fp16, y = var_6311_cast_fp16)[name = tensor<string, []>("x_99_cast_fp16")];
             tensor<fp16, []> var_6313_to_fp16 = const()[name = tensor<string, []>("op_6313_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -4468,8 +4468,8 @@ program(1.0)
             tensor<int32, [1]> out_163_axes_0 = const()[name = tensor<string, []>("out_163_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6324_to_fp16 = const()[name = tensor<string, []>("op_6324_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_163_cast_fp16 = layer_norm(axes = out_163_axes_0, epsilon = var_6324_to_fp16, x = inputs_163_cast_fp16)[name = tensor<string, []>("out_163_cast_fp16")];
-            tensor<fp16, [1024]> obj_67_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_67_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(277134208)))];
-            tensor<fp16, [1024]> obj_67_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_67_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(277136320)))];
+            tensor<fp16, [1024]> obj_67_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_67_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380954688)))];
+            tensor<fp16, [1024]> obj_67_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_67_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380956800)))];
             tensor<fp16, []> obj_67_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_67_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_67_cast_fp16 = batch_norm(beta = obj_67_beta_0_to_fp16, epsilon = obj_67_epsilon_0_to_fp16, gamma = obj_67_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_163_cast_fp16)[name = tensor<string, []>("obj_67_cast_fp16")];
             tensor<string, []> var_6349_pad_type_0 = const()[name = tensor<string, []>("op_6349_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4477,14 +4477,14 @@ program(1.0)
             tensor<int32, [4]> var_6349_pad_0 = const()[name = tensor<string, []>("op_6349_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6349_dilations_0 = const()[name = tensor<string, []>("op_6349_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6349_groups_0 = const()[name = tensor<string, []>("op_6349_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(277138432))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(277662784))), name = tensor<string, []>("layers_16_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380958912))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381745408))), name = tensor<string, []>("layers_16_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6349_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6349_dilations_0, groups = var_6349_groups_0, pad = var_6349_pad_0, pad_type = var_6349_pad_type_0, strides = var_6349_strides_0, weight = layers_16_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_67_cast_fp16)[name = tensor<string, []>("op_6349_cast_fp16")];
             tensor<string, []> var_6355_pad_type_0 = const()[name = tensor<string, []>("op_6355_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6355_strides_0 = const()[name = tensor<string, []>("op_6355_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6355_pad_0 = const()[name = tensor<string, []>("op_6355_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6355_dilations_0 = const()[name = tensor<string, []>("op_6355_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6355_groups_0 = const()[name = tensor<string, []>("op_6355_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(277693056))), name = tensor<string, []>("layers_16_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15027]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(277662912))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381775744))), name = tensor<string, []>("layers_16_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15027]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381745600))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6355_cast_fp16 = conv(dilations = var_6355_dilations_0, groups = var_6355_groups_0, pad = var_6355_pad_0, pad_type = var_6355_pad_type_0, strides = var_6355_strides_0, weight = layers_16_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_67_cast_fp16)[name = tensor<string, []>("op_6355_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_65_cast_fp16 = add(x = var_6349_cast_fp16, y = var_6355_cast_fp16)[name = tensor<string, []>("query_65_cast_fp16")];
             tensor<string, []> var_6364_pad_type_0 = const()[name = tensor<string, []>("op_6364_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4492,14 +4492,14 @@ program(1.0)
             tensor<int32, [4]> var_6364_pad_0 = const()[name = tensor<string, []>("op_6364_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6364_dilations_0 = const()[name = tensor<string, []>("op_6364_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6364_groups_0 = const()[name = tensor<string, []>("op_6364_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(277824192))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(278348544))), name = tensor<string, []>("layers_16_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381906880))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382693376))), name = tensor<string, []>("layers_16_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6364_cast_fp16 = conv(dilations = var_6364_dilations_0, groups = var_6364_groups_0, pad = var_6364_pad_0, pad_type = var_6364_pad_type_0, strides = var_6364_strides_0, weight = layers_16_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_67_cast_fp16)[name = tensor<string, []>("op_6364_cast_fp16")];
             tensor<string, []> var_6370_pad_type_0 = const()[name = tensor<string, []>("op_6370_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6370_strides_0 = const()[name = tensor<string, []>("op_6370_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6370_pad_0 = const()[name = tensor<string, []>("op_6370_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6370_dilations_0 = const()[name = tensor<string, []>("op_6370_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6370_groups_0 = const()[name = tensor<string, []>("op_6370_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(278390272))), name = tensor<string, []>("layers_16_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [20758]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(278348672))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382735168))), name = tensor<string, []>("layers_16_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [20758]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382693568))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6370_cast_fp16 = conv(dilations = var_6370_dilations_0, groups = var_6370_groups_0, pad = var_6370_pad_0, pad_type = var_6370_pad_type_0, strides = var_6370_strides_0, weight = layers_16_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_67_cast_fp16)[name = tensor<string, []>("op_6370_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_33_cast_fp16 = add(x = var_6364_cast_fp16, y = var_6370_cast_fp16)[name = tensor<string, []>("key_33_cast_fp16")];
             tensor<string, []> var_6380_pad_type_0 = const()[name = tensor<string, []>("op_6380_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4507,33 +4507,33 @@ program(1.0)
             tensor<int32, [4]> var_6380_pad_0 = const()[name = tensor<string, []>("op_6380_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6380_dilations_0 = const()[name = tensor<string, []>("op_6380_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6380_groups_0 = const()[name = tensor<string, []>("op_6380_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(278521408))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279045760))), name = tensor<string, []>("layers_16_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382866304))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(383652800))), name = tensor<string, []>("layers_16_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6380_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6380_dilations_0, groups = var_6380_groups_0, pad = var_6380_pad_0, pad_type = var_6380_pad_type_0, strides = var_6380_strides_0, weight = layers_16_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_67_cast_fp16)[name = tensor<string, []>("op_6380_cast_fp16")];
             tensor<string, []> var_6386_pad_type_0 = const()[name = tensor<string, []>("op_6386_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6386_strides_0 = const()[name = tensor<string, []>("op_6386_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6386_pad_0 = const()[name = tensor<string, []>("op_6386_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6386_dilations_0 = const()[name = tensor<string, []>("op_6386_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6386_groups_0 = const()[name = tensor<string, []>("op_6386_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279078016))), name = tensor<string, []>("layers_16_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16020]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279045888))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(383685120))), name = tensor<string, []>("layers_16_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16020]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(383652992))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6386_cast_fp16 = conv(dilations = var_6386_dilations_0, groups = var_6386_groups_0, pad = var_6386_pad_0, pad_type = var_6386_pad_type_0, strides = var_6386_strides_0, weight = layers_16_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_67_cast_fp16)[name = tensor<string, []>("op_6386_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_33_cast_fp16 = add(x = var_6380_cast_fp16, y = var_6386_cast_fp16)[name = tensor<string, []>("value_33_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_6389_to_fp16 = const()[name = tensor<string, []>("op_6389_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279209152)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_6389_to_fp16 = const()[name = tensor<string, []>("op_6389_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(383816256)))];
             tensor<fp16, [1, 1024, 1, 188]> query_67_cast_fp16 = add(x = query_65_cast_fp16, y = var_6389_to_fp16)[name = tensor<string, []>("query_67_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_6392_to_fp16 = const()[name = tensor<string, []>("op_6392_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279211264)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_6392_to_fp16 = const()[name = tensor<string, []>("op_6392_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(383818368)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_33_cast_fp16 = add(x = query_65_cast_fp16, y = var_6392_to_fp16)[name = tensor<string, []>("q_with_bias_v_33_cast_fp16")];
             tensor<string, []> var_6402_pad_type_0 = const()[name = tensor<string, []>("op_6402_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6402_strides_0 = const()[name = tensor<string, []>("op_6402_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6402_pad_0 = const()[name = tensor<string, []>("op_6402_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6402_dilations_0 = const()[name = tensor<string, []>("op_6402_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6402_groups_0 = const()[name = tensor<string, []>("op_6402_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279213376))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279737728))), name = tensor<string, []>("layers_16_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(383820480))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(384606976))), name = tensor<string, []>("layers_16_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_6402_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6402_dilations_0, groups = var_6402_groups_0, pad = var_6402_pad_0, pad_type = var_6402_pad_type_0, strides = var_6402_strides_0, weight = layers_16_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_6402_cast_fp16")];
             tensor<string, []> var_6408_pad_type_0 = const()[name = tensor<string, []>("op_6408_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6408_strides_0 = const()[name = tensor<string, []>("op_6408_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6408_pad_0 = const()[name = tensor<string, []>("op_6408_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6408_dilations_0 = const()[name = tensor<string, []>("op_6408_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6408_groups_0 = const()[name = tensor<string, []>("op_6408_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279794368))), name = tensor<string, []>("layers_16_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [28201]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279737856))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(384663680))), name = tensor<string, []>("layers_16_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [28201]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(384607168))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_6408_cast_fp16 = conv(dilations = var_6408_dilations_0, groups = var_6408_groups_0, pad = var_6408_pad_0, pad_type = var_6408_pad_type_0, strides = var_6408_strides_0, weight = layers_16_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_6408_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_33_cast_fp16 = add(x = var_6402_cast_fp16, y = var_6408_cast_fp16)[name = tensor<string, []>("p_33_cast_fp16")];
             tensor<int32, [4]> var_6412 = const()[name = tensor<string, []>("op_6412"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -4584,22 +4584,22 @@ program(1.0)
             tensor<int32, [4]> var_6465_pad_0 = const()[name = tensor<string, []>("op_6465_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6465_dilations_0 = const()[name = tensor<string, []>("op_6465_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6465_groups_0 = const()[name = tensor<string, []>("op_6465_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(279925504))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(280449856))), name = tensor<string, []>("layers_16_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(384794816))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(385581312))), name = tensor<string, []>("layers_16_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6465_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6465_dilations_0, groups = var_6465_groups_0, pad = var_6465_pad_0, pad_type = var_6465_pad_type_0, strides = var_6465_strides_0, weight = layers_16_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_439_cast_fp16)[name = tensor<string, []>("op_6465_cast_fp16")];
             tensor<string, []> var_6471_pad_type_0 = const()[name = tensor<string, []>("op_6471_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6471_strides_0 = const()[name = tensor<string, []>("op_6471_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6471_pad_0 = const()[name = tensor<string, []>("op_6471_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6471_dilations_0 = const()[name = tensor<string, []>("op_6471_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6471_groups_0 = const()[name = tensor<string, []>("op_6471_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(280484480))), name = tensor<string, []>("layers_16_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17187]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(280449984))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(385616000))), name = tensor<string, []>("layers_16_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17187]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(385581504))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6471_cast_fp16 = conv(dilations = var_6471_dilations_0, groups = var_6471_groups_0, pad = var_6471_pad_0, pad_type = var_6471_pad_type_0, strides = var_6471_strides_0, weight = layers_16_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_439_cast_fp16)[name = tensor<string, []>("op_6471_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_69_cast_fp16 = add(x = var_6465_cast_fp16, y = var_6471_cast_fp16)[name = tensor<string, []>("obj_69_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_165_cast_fp16 = add(x = inputs_163_cast_fp16, y = obj_69_cast_fp16)[name = tensor<string, []>("inputs_165_cast_fp16")];
             tensor<int32, [1]> out_165_axes_0 = const()[name = tensor<string, []>("out_165_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6482_to_fp16 = const()[name = tensor<string, []>("op_6482_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_165_cast_fp16 = layer_norm(axes = out_165_axes_0, epsilon = var_6482_to_fp16, x = inputs_165_cast_fp16)[name = tensor<string, []>("out_165_cast_fp16")];
-            tensor<fp16, [1024]> input_441_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_441_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(280615616)))];
-            tensor<fp16, [1024]> input_441_beta_0_to_fp16 = const()[name = tensor<string, []>("input_441_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(280617728)))];
+            tensor<fp16, [1024]> input_441_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_441_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(385747136)))];
+            tensor<fp16, [1024]> input_441_beta_0_to_fp16 = const()[name = tensor<string, []>("input_441_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(385749248)))];
             tensor<fp16, []> input_441_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_441_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_441_cast_fp16 = batch_norm(beta = input_441_beta_0_to_fp16, epsilon = input_441_epsilon_0_to_fp16, gamma = input_441_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_165_cast_fp16)[name = tensor<string, []>("input_441_cast_fp16")];
             tensor<string, []> var_6503_pad_type_0 = const()[name = tensor<string, []>("op_6503_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4607,14 +4607,14 @@ program(1.0)
             tensor<int32, [4]> var_6503_pad_0 = const()[name = tensor<string, []>("op_6503_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6503_dilations_0 = const()[name = tensor<string, []>("op_6503_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6503_groups_0 = const()[name = tensor<string, []>("op_6503_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_16_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(280619840))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281668480))), name = tensor<string, []>("layers_16_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_16_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(385751360))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(387324288))), name = tensor<string, []>("layers_16_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_6503_cast_fp16 = conv(dilations = var_6503_dilations_0, groups = var_6503_groups_0, pad = var_6503_pad_0, pad_type = var_6503_pad_type_0, strides = var_6503_strides_0, weight = layers_16_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_441_cast_fp16)[name = tensor<string, []>("op_6503_cast_fp16")];
             tensor<string, []> var_6509_pad_type_0 = const()[name = tensor<string, []>("op_6509_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6509_strides_0 = const()[name = tensor<string, []>("op_6509_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6509_pad_0 = const()[name = tensor<string, []>("op_6509_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6509_dilations_0 = const()[name = tensor<string, []>("op_6509_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6509_groups_0 = const()[name = tensor<string, []>("op_6509_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_16_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281732224))), name = tensor<string, []>("layers_16_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31762]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281668608))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_16_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(387388096))), name = tensor<string, []>("layers_16_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [31762]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(387324480))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_6509_cast_fp16 = conv(dilations = var_6509_dilations_0, groups = var_6509_groups_0, pad = var_6509_pad_0, pad_type = var_6509_pad_type_0, strides = var_6509_strides_0, weight = layers_16_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_441_cast_fp16)[name = tensor<string, []>("op_6509_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_443_cast_fp16 = add(x = var_6503_cast_fp16, y = var_6509_cast_fp16)[name = tensor<string, []>("input_443_cast_fp16")];
             tensor<int32, []> input_445_split_num_splits_0 = const()[name = tensor<string, []>("input_445_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -4627,8 +4627,8 @@ program(1.0)
             tensor<int32, []> input_447_groups_0 = const()[name = tensor<string, []>("input_447_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_447_strides_0 = const()[name = tensor<string, []>("input_447_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_447_dilations_0 = const()[name = tensor<string, []>("input_447_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_300_to_fp16 = const()[name = tensor<string, []>("const_300_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(281994432)))];
-            tensor<fp16, [1024]> const_301_to_fp16 = const()[name = tensor<string, []>("const_301_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(282012928)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_300_to_fp16 = const()[name = tensor<string, []>("const_300_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(387650304)))];
+            tensor<fp16, [1024]> const_301_to_fp16 = const()[name = tensor<string, []>("const_301_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(387668800)))];
             tensor<fp16, [1, 1024, 1, 188]> input_449_cast_fp16 = conv(bias = const_301_to_fp16, dilations = input_447_dilations_0, groups = input_447_groups_0, pad = input_447_pad_0, pad_type = input_447_pad_type_0, strides = input_447_strides_0, weight = const_300_to_fp16, x = input_445_cast_fp16)[name = tensor<string, []>("input_449_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_451_cast_fp16 = silu(x = input_449_cast_fp16)[name = tensor<string, []>("input_451_cast_fp16")];
             tensor<string, []> var_6531_pad_type_0 = const()[name = tensor<string, []>("op_6531_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4636,22 +4636,22 @@ program(1.0)
             tensor<int32, [4]> var_6531_pad_0 = const()[name = tensor<string, []>("op_6531_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6531_dilations_0 = const()[name = tensor<string, []>("op_6531_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6531_groups_0 = const()[name = tensor<string, []>("op_6531_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(282015040))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(282539392))), name = tensor<string, []>("layers_16_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(387670912))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388457408))), name = tensor<string, []>("layers_16_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6531_cast_fp16 = conv(dilations = var_6531_dilations_0, groups = var_6531_groups_0, pad = var_6531_pad_0, pad_type = var_6531_pad_type_0, strides = var_6531_strides_0, weight = layers_16_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_451_cast_fp16)[name = tensor<string, []>("op_6531_cast_fp16")];
             tensor<string, []> var_6537_pad_type_0 = const()[name = tensor<string, []>("op_6537_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6537_strides_0 = const()[name = tensor<string, []>("op_6537_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6537_pad_0 = const()[name = tensor<string, []>("op_6537_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6537_dilations_0 = const()[name = tensor<string, []>("op_6537_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6537_groups_0 = const()[name = tensor<string, []>("op_6537_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_16_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(282570176))), name = tensor<string, []>("layers_16_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15278]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(282539520))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_16_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388488256))), name = tensor<string, []>("layers_16_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15278]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388457600))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6537_cast_fp16 = conv(dilations = var_6537_dilations_0, groups = var_6537_groups_0, pad = var_6537_pad_0, pad_type = var_6537_pad_type_0, strides = var_6537_strides_0, weight = layers_16_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_451_cast_fp16)[name = tensor<string, []>("op_6537_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_101_cast_fp16 = add(x = var_6531_cast_fp16, y = var_6537_cast_fp16)[name = tensor<string, []>("x_101_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_167_cast_fp16 = add(x = inputs_165_cast_fp16, y = x_101_cast_fp16)[name = tensor<string, []>("inputs_167_cast_fp16")];
             tensor<int32, [1]> out_167_axes_0 = const()[name = tensor<string, []>("out_167_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6548_to_fp16 = const()[name = tensor<string, []>("op_6548_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_167_cast_fp16 = layer_norm(axes = out_167_axes_0, epsilon = var_6548_to_fp16, x = inputs_167_cast_fp16)[name = tensor<string, []>("out_167_cast_fp16")];
-            tensor<fp16, [1024]> input_453_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_453_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(282701312)))];
-            tensor<fp16, [1024]> input_453_beta_0_to_fp16 = const()[name = tensor<string, []>("input_453_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(282703424)))];
+            tensor<fp16, [1024]> input_453_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_453_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388619392)))];
+            tensor<fp16, [1024]> input_453_beta_0_to_fp16 = const()[name = tensor<string, []>("input_453_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388621504)))];
             tensor<fp16, []> input_453_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_453_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_453_cast_fp16 = batch_norm(beta = input_453_beta_0_to_fp16, epsilon = input_453_epsilon_0_to_fp16, gamma = input_453_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_167_cast_fp16)[name = tensor<string, []>("input_453_cast_fp16")];
             tensor<string, []> var_6568_pad_type_0 = const()[name = tensor<string, []>("op_6568_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4659,14 +4659,14 @@ program(1.0)
             tensor<int32, [4]> var_6568_pad_0 = const()[name = tensor<string, []>("op_6568_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6568_dilations_0 = const()[name = tensor<string, []>("op_6568_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6568_groups_0 = const()[name = tensor<string, []>("op_6568_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_16_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(282705536))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284802752))), name = tensor<string, []>("layers_16_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_16_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388623616))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(391769408))), name = tensor<string, []>("layers_16_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6568_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_6568_dilations_0, groups = var_6568_groups_0, pad = var_6568_pad_0, pad_type = var_6568_pad_type_0, strides = var_6568_strides_0, weight = layers_16_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_453_cast_fp16)[name = tensor<string, []>("op_6568_cast_fp16")];
             tensor<string, []> var_6574_pad_type_0 = const()[name = tensor<string, []>("op_6574_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6574_strides_0 = const()[name = tensor<string, []>("op_6574_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6574_pad_0 = const()[name = tensor<string, []>("op_6574_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6574_dilations_0 = const()[name = tensor<string, []>("op_6574_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6574_groups_0 = const()[name = tensor<string, []>("op_6574_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_16_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284939008))), name = tensor<string, []>("layers_16_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [68003]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(284802880))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_16_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(391905728))), name = tensor<string, []>("layers_16_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [68003]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(391769600))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6574_cast_fp16 = conv(dilations = var_6574_dilations_0, groups = var_6574_groups_0, pad = var_6574_pad_0, pad_type = var_6574_pad_type_0, strides = var_6574_strides_0, weight = layers_16_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_453_cast_fp16)[name = tensor<string, []>("op_6574_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_455_cast_fp16 = add(x = var_6568_cast_fp16, y = var_6574_cast_fp16)[name = tensor<string, []>("input_455_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_457_cast_fp16 = silu(x = input_455_cast_fp16)[name = tensor<string, []>("input_457_cast_fp16")];
@@ -4675,14 +4675,14 @@ program(1.0)
             tensor<int32, [4]> var_6585_pad_0 = const()[name = tensor<string, []>("op_6585_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6585_dilations_0 = const()[name = tensor<string, []>("op_6585_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6585_groups_0 = const()[name = tensor<string, []>("op_6585_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_16_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(285463360))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(287560576))), name = tensor<string, []>("layers_16_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_16_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(392430080))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395575872))), name = tensor<string, []>("layers_16_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6585_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6585_dilations_0, groups = var_6585_groups_0, pad = var_6585_pad_0, pad_type = var_6585_pad_type_0, strides = var_6585_strides_0, weight = layers_16_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_457_cast_fp16)[name = tensor<string, []>("op_6585_cast_fp16")];
             tensor<string, []> var_6591_pad_type_0 = const()[name = tensor<string, []>("op_6591_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6591_strides_0 = const()[name = tensor<string, []>("op_6591_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6591_pad_0 = const()[name = tensor<string, []>("op_6591_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6591_dilations_0 = const()[name = tensor<string, []>("op_6591_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6591_groups_0 = const()[name = tensor<string, []>("op_6591_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_16_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(287700672))), name = tensor<string, []>("layers_16_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69940]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(287560704))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_16_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395716032))), name = tensor<string, []>("layers_16_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69940]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395576064))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6591_cast_fp16 = conv(dilations = var_6591_dilations_0, groups = var_6591_groups_0, pad = var_6591_pad_0, pad_type = var_6591_pad_type_0, strides = var_6591_strides_0, weight = layers_16_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_457_cast_fp16)[name = tensor<string, []>("op_6591_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_103_cast_fp16 = add(x = var_6585_cast_fp16, y = var_6591_cast_fp16)[name = tensor<string, []>("x_103_cast_fp16")];
             tensor<fp16, []> var_6593_to_fp16 = const()[name = tensor<string, []>("op_6593_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -4691,16 +4691,16 @@ program(1.0)
             tensor<int32, [1]> out_169_axes_0 = const()[name = tensor<string, []>("out_169_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6604_to_fp16 = const()[name = tensor<string, []>("op_6604_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_169_cast_fp16 = layer_norm(axes = out_169_axes_0, epsilon = var_6604_to_fp16, x = inputs_169_cast_fp16)[name = tensor<string, []>("out_169_cast_fp16")];
-            tensor<fp16, [1024]> inputs_171_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_171_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288225024)))];
-            tensor<fp16, [1024]> inputs_171_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_171_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288227136)))];
+            tensor<fp16, [1024]> inputs_171_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_171_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396240384)))];
+            tensor<fp16, [1024]> inputs_171_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_171_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396242496)))];
             tensor<fp16, []> inputs_171_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_171_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_171_cast_fp16 = batch_norm(beta = inputs_171_beta_0_to_fp16, epsilon = inputs_171_epsilon_0_to_fp16, gamma = inputs_171_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_169_cast_fp16)[name = tensor<string, []>("inputs_171_cast_fp16")];
             tensor<int32, []> var_6618 = const()[name = tensor<string, []>("op_6618"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_171_axes_0 = const()[name = tensor<string, []>("out_171_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6649_to_fp16 = const()[name = tensor<string, []>("op_6649_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_171_cast_fp16 = layer_norm(axes = out_171_axes_0, epsilon = var_6649_to_fp16, x = inputs_171_cast_fp16)[name = tensor<string, []>("out_171_cast_fp16")];
-            tensor<fp16, [1024]> input_459_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_459_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288229248)))];
-            tensor<fp16, [1024]> input_459_beta_0_to_fp16 = const()[name = tensor<string, []>("input_459_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288231360)))];
+            tensor<fp16, [1024]> input_459_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_459_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396244608)))];
+            tensor<fp16, [1024]> input_459_beta_0_to_fp16 = const()[name = tensor<string, []>("input_459_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396246720)))];
             tensor<fp16, []> input_459_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_459_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_459_cast_fp16 = batch_norm(beta = input_459_beta_0_to_fp16, epsilon = input_459_epsilon_0_to_fp16, gamma = input_459_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_171_cast_fp16)[name = tensor<string, []>("input_459_cast_fp16")];
             tensor<string, []> var_6669_pad_type_0 = const()[name = tensor<string, []>("op_6669_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4708,14 +4708,14 @@ program(1.0)
             tensor<int32, [4]> var_6669_pad_0 = const()[name = tensor<string, []>("op_6669_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6669_dilations_0 = const()[name = tensor<string, []>("op_6669_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6669_groups_0 = const()[name = tensor<string, []>("op_6669_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_17_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(288233472))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(290330688))), name = tensor<string, []>("layers_17_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_17_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396248832))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(399394624))), name = tensor<string, []>("layers_17_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6669_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_6669_dilations_0, groups = var_6669_groups_0, pad = var_6669_pad_0, pad_type = var_6669_pad_type_0, strides = var_6669_strides_0, weight = layers_17_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_459_cast_fp16)[name = tensor<string, []>("op_6669_cast_fp16")];
             tensor<string, []> var_6675_pad_type_0 = const()[name = tensor<string, []>("op_6675_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6675_strides_0 = const()[name = tensor<string, []>("op_6675_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6675_pad_0 = const()[name = tensor<string, []>("op_6675_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6675_dilations_0 = const()[name = tensor<string, []>("op_6675_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6675_groups_0 = const()[name = tensor<string, []>("op_6675_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_17_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(290475200))), name = tensor<string, []>("layers_17_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72139]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(290330816))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_17_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(399539200))), name = tensor<string, []>("layers_17_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72139]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(399394816))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6675_cast_fp16 = conv(dilations = var_6675_dilations_0, groups = var_6675_groups_0, pad = var_6675_pad_0, pad_type = var_6675_pad_type_0, strides = var_6675_strides_0, weight = layers_17_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_459_cast_fp16)[name = tensor<string, []>("op_6675_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_461_cast_fp16 = add(x = var_6669_cast_fp16, y = var_6675_cast_fp16)[name = tensor<string, []>("input_461_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_463_cast_fp16 = silu(x = input_461_cast_fp16)[name = tensor<string, []>("input_463_cast_fp16")];
@@ -4724,14 +4724,14 @@ program(1.0)
             tensor<int32, [4]> var_6686_pad_0 = const()[name = tensor<string, []>("op_6686_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6686_dilations_0 = const()[name = tensor<string, []>("op_6686_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6686_groups_0 = const()[name = tensor<string, []>("op_6686_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_17_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(290999552))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293096768))), name = tensor<string, []>("layers_17_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_17_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(400063552))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(403209344))), name = tensor<string, []>("layers_17_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6686_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6686_dilations_0, groups = var_6686_groups_0, pad = var_6686_pad_0, pad_type = var_6686_pad_type_0, strides = var_6686_strides_0, weight = layers_17_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_463_cast_fp16)[name = tensor<string, []>("op_6686_cast_fp16")];
             tensor<string, []> var_6692_pad_type_0 = const()[name = tensor<string, []>("op_6692_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6692_strides_0 = const()[name = tensor<string, []>("op_6692_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6692_pad_0 = const()[name = tensor<string, []>("op_6692_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6692_dilations_0 = const()[name = tensor<string, []>("op_6692_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6692_groups_0 = const()[name = tensor<string, []>("op_6692_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_17_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293241856))), name = tensor<string, []>("layers_17_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72428]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293096896))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_17_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(403354496))), name = tensor<string, []>("layers_17_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72428]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(403209536))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6692_cast_fp16 = conv(dilations = var_6692_dilations_0, groups = var_6692_groups_0, pad = var_6692_pad_0, pad_type = var_6692_pad_type_0, strides = var_6692_strides_0, weight = layers_17_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_463_cast_fp16)[name = tensor<string, []>("op_6692_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_105_cast_fp16 = add(x = var_6686_cast_fp16, y = var_6692_cast_fp16)[name = tensor<string, []>("x_105_cast_fp16")];
             tensor<fp16, []> var_6694_to_fp16 = const()[name = tensor<string, []>("op_6694_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -4740,8 +4740,8 @@ program(1.0)
             tensor<int32, [1]> out_173_axes_0 = const()[name = tensor<string, []>("out_173_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6705_to_fp16 = const()[name = tensor<string, []>("op_6705_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_173_cast_fp16 = layer_norm(axes = out_173_axes_0, epsilon = var_6705_to_fp16, x = inputs_173_cast_fp16)[name = tensor<string, []>("out_173_cast_fp16")];
-            tensor<fp16, [1024]> obj_71_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_71_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293766208)))];
-            tensor<fp16, [1024]> obj_71_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_71_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293768320)))];
+            tensor<fp16, [1024]> obj_71_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_71_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(403878848)))];
+            tensor<fp16, [1024]> obj_71_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_71_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(403880960)))];
             tensor<fp16, []> obj_71_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_71_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_71_cast_fp16 = batch_norm(beta = obj_71_beta_0_to_fp16, epsilon = obj_71_epsilon_0_to_fp16, gamma = obj_71_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_173_cast_fp16)[name = tensor<string, []>("obj_71_cast_fp16")];
             tensor<string, []> var_6730_pad_type_0 = const()[name = tensor<string, []>("op_6730_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4749,14 +4749,14 @@ program(1.0)
             tensor<int32, [4]> var_6730_pad_0 = const()[name = tensor<string, []>("op_6730_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6730_dilations_0 = const()[name = tensor<string, []>("op_6730_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6730_groups_0 = const()[name = tensor<string, []>("op_6730_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(293770432))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(294294784))), name = tensor<string, []>("layers_17_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(403883072))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404669568))), name = tensor<string, []>("layers_17_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6730_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6730_dilations_0, groups = var_6730_groups_0, pad = var_6730_pad_0, pad_type = var_6730_pad_type_0, strides = var_6730_strides_0, weight = layers_17_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_71_cast_fp16)[name = tensor<string, []>("op_6730_cast_fp16")];
             tensor<string, []> var_6736_pad_type_0 = const()[name = tensor<string, []>("op_6736_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6736_strides_0 = const()[name = tensor<string, []>("op_6736_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6736_pad_0 = const()[name = tensor<string, []>("op_6736_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6736_dilations_0 = const()[name = tensor<string, []>("op_6736_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6736_groups_0 = const()[name = tensor<string, []>("op_6736_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(294326080))), name = tensor<string, []>("layers_17_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15524]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(294294912))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404700928))), name = tensor<string, []>("layers_17_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15524]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404669760))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6736_cast_fp16 = conv(dilations = var_6736_dilations_0, groups = var_6736_groups_0, pad = var_6736_pad_0, pad_type = var_6736_pad_type_0, strides = var_6736_strides_0, weight = layers_17_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_71_cast_fp16)[name = tensor<string, []>("op_6736_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_69_cast_fp16 = add(x = var_6730_cast_fp16, y = var_6736_cast_fp16)[name = tensor<string, []>("query_69_cast_fp16")];
             tensor<string, []> var_6745_pad_type_0 = const()[name = tensor<string, []>("op_6745_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4764,14 +4764,14 @@ program(1.0)
             tensor<int32, [4]> var_6745_pad_0 = const()[name = tensor<string, []>("op_6745_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6745_dilations_0 = const()[name = tensor<string, []>("op_6745_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6745_groups_0 = const()[name = tensor<string, []>("op_6745_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(294457216))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(294981568))), name = tensor<string, []>("layers_17_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404832064))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(405618560))), name = tensor<string, []>("layers_17_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6745_cast_fp16 = conv(dilations = var_6745_dilations_0, groups = var_6745_groups_0, pad = var_6745_pad_0, pad_type = var_6745_pad_type_0, strides = var_6745_strides_0, weight = layers_17_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_71_cast_fp16)[name = tensor<string, []>("op_6745_cast_fp16")];
             tensor<string, []> var_6751_pad_type_0 = const()[name = tensor<string, []>("op_6751_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6751_strides_0 = const()[name = tensor<string, []>("op_6751_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6751_pad_0 = const()[name = tensor<string, []>("op_6751_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6751_dilations_0 = const()[name = tensor<string, []>("op_6751_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6751_groups_0 = const()[name = tensor<string, []>("op_6751_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295017408))), name = tensor<string, []>("layers_17_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17814]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(294981696))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(405654464))), name = tensor<string, []>("layers_17_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17814]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(405618752))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6751_cast_fp16 = conv(dilations = var_6751_dilations_0, groups = var_6751_groups_0, pad = var_6751_pad_0, pad_type = var_6751_pad_type_0, strides = var_6751_strides_0, weight = layers_17_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_71_cast_fp16)[name = tensor<string, []>("op_6751_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_35_cast_fp16 = add(x = var_6745_cast_fp16, y = var_6751_cast_fp16)[name = tensor<string, []>("key_35_cast_fp16")];
             tensor<string, []> var_6761_pad_type_0 = const()[name = tensor<string, []>("op_6761_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4779,33 +4779,33 @@ program(1.0)
             tensor<int32, [4]> var_6761_pad_0 = const()[name = tensor<string, []>("op_6761_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6761_dilations_0 = const()[name = tensor<string, []>("op_6761_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6761_groups_0 = const()[name = tensor<string, []>("op_6761_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295148544))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295672896))), name = tensor<string, []>("layers_17_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(405785600))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(406572096))), name = tensor<string, []>("layers_17_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6761_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6761_dilations_0, groups = var_6761_groups_0, pad = var_6761_pad_0, pad_type = var_6761_pad_type_0, strides = var_6761_strides_0, weight = layers_17_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_71_cast_fp16)[name = tensor<string, []>("op_6761_cast_fp16")];
             tensor<string, []> var_6767_pad_type_0 = const()[name = tensor<string, []>("op_6767_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6767_strides_0 = const()[name = tensor<string, []>("op_6767_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6767_pad_0 = const()[name = tensor<string, []>("op_6767_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6767_dilations_0 = const()[name = tensor<string, []>("op_6767_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6767_groups_0 = const()[name = tensor<string, []>("op_6767_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295704704))), name = tensor<string, []>("layers_17_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15802]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295673024))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(406603968))), name = tensor<string, []>("layers_17_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15802]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(406572288))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6767_cast_fp16 = conv(dilations = var_6767_dilations_0, groups = var_6767_groups_0, pad = var_6767_pad_0, pad_type = var_6767_pad_type_0, strides = var_6767_strides_0, weight = layers_17_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_71_cast_fp16)[name = tensor<string, []>("op_6767_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_35_cast_fp16 = add(x = var_6761_cast_fp16, y = var_6767_cast_fp16)[name = tensor<string, []>("value_35_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_6770_to_fp16 = const()[name = tensor<string, []>("op_6770_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295835840)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_6770_to_fp16 = const()[name = tensor<string, []>("op_6770_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(406735104)))];
             tensor<fp16, [1, 1024, 1, 188]> query_71_cast_fp16 = add(x = query_69_cast_fp16, y = var_6770_to_fp16)[name = tensor<string, []>("query_71_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_6773_to_fp16 = const()[name = tensor<string, []>("op_6773_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295837952)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_6773_to_fp16 = const()[name = tensor<string, []>("op_6773_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(406737216)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_35_cast_fp16 = add(x = query_69_cast_fp16, y = var_6773_to_fp16)[name = tensor<string, []>("q_with_bias_v_35_cast_fp16")];
             tensor<string, []> var_6783_pad_type_0 = const()[name = tensor<string, []>("op_6783_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6783_strides_0 = const()[name = tensor<string, []>("op_6783_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6783_pad_0 = const()[name = tensor<string, []>("op_6783_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6783_dilations_0 = const()[name = tensor<string, []>("op_6783_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6783_groups_0 = const()[name = tensor<string, []>("op_6783_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(295840064))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296364416))), name = tensor<string, []>("layers_17_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(406739328))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(407525824))), name = tensor<string, []>("layers_17_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_6783_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6783_dilations_0, groups = var_6783_groups_0, pad = var_6783_pad_0, pad_type = var_6783_pad_type_0, strides = var_6783_strides_0, weight = layers_17_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_6783_cast_fp16")];
             tensor<string, []> var_6789_pad_type_0 = const()[name = tensor<string, []>("op_6789_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6789_strides_0 = const()[name = tensor<string, []>("op_6789_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6789_pad_0 = const()[name = tensor<string, []>("op_6789_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6789_dilations_0 = const()[name = tensor<string, []>("op_6789_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6789_groups_0 = const()[name = tensor<string, []>("op_6789_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296419584))), name = tensor<string, []>("layers_17_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [27472]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296364544))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(407581056))), name = tensor<string, []>("layers_17_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [27472]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(407526016))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_6789_cast_fp16 = conv(dilations = var_6789_dilations_0, groups = var_6789_groups_0, pad = var_6789_pad_0, pad_type = var_6789_pad_type_0, strides = var_6789_strides_0, weight = layers_17_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_6789_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_35_cast_fp16 = add(x = var_6783_cast_fp16, y = var_6789_cast_fp16)[name = tensor<string, []>("p_35_cast_fp16")];
             tensor<int32, [4]> var_6793 = const()[name = tensor<string, []>("op_6793"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -4856,22 +4856,22 @@ program(1.0)
             tensor<int32, [4]> var_6846_pad_0 = const()[name = tensor<string, []>("op_6846_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6846_dilations_0 = const()[name = tensor<string, []>("op_6846_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6846_groups_0 = const()[name = tensor<string, []>("op_6846_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(296550720))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(297075072))), name = tensor<string, []>("layers_17_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(407712192))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(408498688))), name = tensor<string, []>("layers_17_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6846_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6846_dilations_0, groups = var_6846_groups_0, pad = var_6846_pad_0, pad_type = var_6846_pad_type_0, strides = var_6846_strides_0, weight = layers_17_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_465_cast_fp16)[name = tensor<string, []>("op_6846_cast_fp16")];
             tensor<string, []> var_6852_pad_type_0 = const()[name = tensor<string, []>("op_6852_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6852_strides_0 = const()[name = tensor<string, []>("op_6852_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6852_pad_0 = const()[name = tensor<string, []>("op_6852_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6852_dilations_0 = const()[name = tensor<string, []>("op_6852_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6852_groups_0 = const()[name = tensor<string, []>("op_6852_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(297106752))), name = tensor<string, []>("layers_17_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15729]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(297075200))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(408530432))), name = tensor<string, []>("layers_17_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15729]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(408498880))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6852_cast_fp16 = conv(dilations = var_6852_dilations_0, groups = var_6852_groups_0, pad = var_6852_pad_0, pad_type = var_6852_pad_type_0, strides = var_6852_strides_0, weight = layers_17_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_465_cast_fp16)[name = tensor<string, []>("op_6852_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_73_cast_fp16 = add(x = var_6846_cast_fp16, y = var_6852_cast_fp16)[name = tensor<string, []>("obj_73_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_175_cast_fp16 = add(x = inputs_173_cast_fp16, y = obj_73_cast_fp16)[name = tensor<string, []>("inputs_175_cast_fp16")];
             tensor<int32, [1]> out_175_axes_0 = const()[name = tensor<string, []>("out_175_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6863_to_fp16 = const()[name = tensor<string, []>("op_6863_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_175_cast_fp16 = layer_norm(axes = out_175_axes_0, epsilon = var_6863_to_fp16, x = inputs_175_cast_fp16)[name = tensor<string, []>("out_175_cast_fp16")];
-            tensor<fp16, [1024]> input_467_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_467_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(297237888)))];
-            tensor<fp16, [1024]> input_467_beta_0_to_fp16 = const()[name = tensor<string, []>("input_467_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(297240000)))];
+            tensor<fp16, [1024]> input_467_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_467_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(408661568)))];
+            tensor<fp16, [1024]> input_467_beta_0_to_fp16 = const()[name = tensor<string, []>("input_467_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(408663680)))];
             tensor<fp16, []> input_467_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_467_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_467_cast_fp16 = batch_norm(beta = input_467_beta_0_to_fp16, epsilon = input_467_epsilon_0_to_fp16, gamma = input_467_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_175_cast_fp16)[name = tensor<string, []>("input_467_cast_fp16")];
             tensor<string, []> var_6884_pad_type_0 = const()[name = tensor<string, []>("op_6884_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4879,14 +4879,14 @@ program(1.0)
             tensor<int32, [4]> var_6884_pad_0 = const()[name = tensor<string, []>("op_6884_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6884_dilations_0 = const()[name = tensor<string, []>("op_6884_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6884_groups_0 = const()[name = tensor<string, []>("op_6884_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_17_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(297242112))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(298290752))), name = tensor<string, []>("layers_17_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_17_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(408665792))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(410238720))), name = tensor<string, []>("layers_17_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_6884_cast_fp16 = conv(dilations = var_6884_dilations_0, groups = var_6884_groups_0, pad = var_6884_pad_0, pad_type = var_6884_pad_type_0, strides = var_6884_strides_0, weight = layers_17_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_467_cast_fp16)[name = tensor<string, []>("op_6884_cast_fp16")];
             tensor<string, []> var_6890_pad_type_0 = const()[name = tensor<string, []>("op_6890_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6890_strides_0 = const()[name = tensor<string, []>("op_6890_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6890_pad_0 = const()[name = tensor<string, []>("op_6890_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6890_dilations_0 = const()[name = tensor<string, []>("op_6890_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6890_groups_0 = const()[name = tensor<string, []>("op_6890_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_17_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(298355648))), name = tensor<string, []>("layers_17_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32331]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(298290880))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_17_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(410303680))), name = tensor<string, []>("layers_17_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32331]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(410238912))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_6890_cast_fp16 = conv(dilations = var_6890_dilations_0, groups = var_6890_groups_0, pad = var_6890_pad_0, pad_type = var_6890_pad_type_0, strides = var_6890_strides_0, weight = layers_17_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_467_cast_fp16)[name = tensor<string, []>("op_6890_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_469_cast_fp16 = add(x = var_6884_cast_fp16, y = var_6890_cast_fp16)[name = tensor<string, []>("input_469_cast_fp16")];
             tensor<int32, []> input_471_split_num_splits_0 = const()[name = tensor<string, []>("input_471_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -4899,8 +4899,8 @@ program(1.0)
             tensor<int32, []> input_473_groups_0 = const()[name = tensor<string, []>("input_473_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_473_strides_0 = const()[name = tensor<string, []>("input_473_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_473_dilations_0 = const()[name = tensor<string, []>("input_473_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_302_to_fp16 = const()[name = tensor<string, []>("const_302_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(298617856)))];
-            tensor<fp16, [1024]> const_303_to_fp16 = const()[name = tensor<string, []>("const_303_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(298636352)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_302_to_fp16 = const()[name = tensor<string, []>("const_302_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(410565888)))];
+            tensor<fp16, [1024]> const_303_to_fp16 = const()[name = tensor<string, []>("const_303_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(410584384)))];
             tensor<fp16, [1, 1024, 1, 188]> input_475_cast_fp16 = conv(bias = const_303_to_fp16, dilations = input_473_dilations_0, groups = input_473_groups_0, pad = input_473_pad_0, pad_type = input_473_pad_type_0, strides = input_473_strides_0, weight = const_302_to_fp16, x = input_471_cast_fp16)[name = tensor<string, []>("input_475_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_477_cast_fp16 = silu(x = input_475_cast_fp16)[name = tensor<string, []>("input_477_cast_fp16")];
             tensor<string, []> var_6912_pad_type_0 = const()[name = tensor<string, []>("op_6912_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4908,22 +4908,22 @@ program(1.0)
             tensor<int32, [4]> var_6912_pad_0 = const()[name = tensor<string, []>("op_6912_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6912_dilations_0 = const()[name = tensor<string, []>("op_6912_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6912_groups_0 = const()[name = tensor<string, []>("op_6912_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(298638464))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(299162816))), name = tensor<string, []>("layers_17_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(410586496))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(411372992))), name = tensor<string, []>("layers_17_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6912_cast_fp16 = conv(dilations = var_6912_dilations_0, groups = var_6912_groups_0, pad = var_6912_pad_0, pad_type = var_6912_pad_type_0, strides = var_6912_strides_0, weight = layers_17_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_477_cast_fp16)[name = tensor<string, []>("op_6912_cast_fp16")];
             tensor<string, []> var_6918_pad_type_0 = const()[name = tensor<string, []>("op_6918_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6918_strides_0 = const()[name = tensor<string, []>("op_6918_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6918_pad_0 = const()[name = tensor<string, []>("op_6918_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6918_dilations_0 = const()[name = tensor<string, []>("op_6918_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6918_groups_0 = const()[name = tensor<string, []>("op_6918_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_17_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(299193664))), name = tensor<string, []>("layers_17_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15300]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(299162944))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_17_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(411403904))), name = tensor<string, []>("layers_17_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15300]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(411373184))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6918_cast_fp16 = conv(dilations = var_6918_dilations_0, groups = var_6918_groups_0, pad = var_6918_pad_0, pad_type = var_6918_pad_type_0, strides = var_6918_strides_0, weight = layers_17_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_477_cast_fp16)[name = tensor<string, []>("op_6918_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_107_cast_fp16 = add(x = var_6912_cast_fp16, y = var_6918_cast_fp16)[name = tensor<string, []>("x_107_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_177_cast_fp16 = add(x = inputs_175_cast_fp16, y = x_107_cast_fp16)[name = tensor<string, []>("inputs_177_cast_fp16")];
             tensor<int32, [1]> out_177_axes_0 = const()[name = tensor<string, []>("out_177_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6929_to_fp16 = const()[name = tensor<string, []>("op_6929_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_177_cast_fp16 = layer_norm(axes = out_177_axes_0, epsilon = var_6929_to_fp16, x = inputs_177_cast_fp16)[name = tensor<string, []>("out_177_cast_fp16")];
-            tensor<fp16, [1024]> input_479_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_479_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(299324800)))];
-            tensor<fp16, [1024]> input_479_beta_0_to_fp16 = const()[name = tensor<string, []>("input_479_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(299326912)))];
+            tensor<fp16, [1024]> input_479_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_479_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(411535040)))];
+            tensor<fp16, [1024]> input_479_beta_0_to_fp16 = const()[name = tensor<string, []>("input_479_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(411537152)))];
             tensor<fp16, []> input_479_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_479_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_479_cast_fp16 = batch_norm(beta = input_479_beta_0_to_fp16, epsilon = input_479_epsilon_0_to_fp16, gamma = input_479_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_177_cast_fp16)[name = tensor<string, []>("input_479_cast_fp16")];
             tensor<string, []> var_6949_pad_type_0 = const()[name = tensor<string, []>("op_6949_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4931,14 +4931,14 @@ program(1.0)
             tensor<int32, [4]> var_6949_pad_0 = const()[name = tensor<string, []>("op_6949_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6949_dilations_0 = const()[name = tensor<string, []>("op_6949_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6949_groups_0 = const()[name = tensor<string, []>("op_6949_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_17_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(299329024))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(301426240))), name = tensor<string, []>("layers_17_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_17_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(411539264))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(414685056))), name = tensor<string, []>("layers_17_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6949_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_6949_dilations_0, groups = var_6949_groups_0, pad = var_6949_pad_0, pad_type = var_6949_pad_type_0, strides = var_6949_strides_0, weight = layers_17_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_479_cast_fp16)[name = tensor<string, []>("op_6949_cast_fp16")];
             tensor<string, []> var_6955_pad_type_0 = const()[name = tensor<string, []>("op_6955_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6955_strides_0 = const()[name = tensor<string, []>("op_6955_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6955_pad_0 = const()[name = tensor<string, []>("op_6955_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6955_dilations_0 = const()[name = tensor<string, []>("op_6955_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6955_groups_0 = const()[name = tensor<string, []>("op_6955_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_17_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(301560768))), name = tensor<string, []>("layers_17_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [67158]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(301426368))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_17_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(414819648))), name = tensor<string, []>("layers_17_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [67158]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(414685248))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_6955_cast_fp16 = conv(dilations = var_6955_dilations_0, groups = var_6955_groups_0, pad = var_6955_pad_0, pad_type = var_6955_pad_type_0, strides = var_6955_strides_0, weight = layers_17_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_479_cast_fp16)[name = tensor<string, []>("op_6955_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_481_cast_fp16 = add(x = var_6949_cast_fp16, y = var_6955_cast_fp16)[name = tensor<string, []>("input_481_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_483_cast_fp16 = silu(x = input_481_cast_fp16)[name = tensor<string, []>("input_483_cast_fp16")];
@@ -4947,14 +4947,14 @@ program(1.0)
             tensor<int32, [4]> var_6966_pad_0 = const()[name = tensor<string, []>("op_6966_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6966_dilations_0 = const()[name = tensor<string, []>("op_6966_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6966_groups_0 = const()[name = tensor<string, []>("op_6966_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_17_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(302085120))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304182336))), name = tensor<string, []>("layers_17_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_17_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(415344000))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(418489792))), name = tensor<string, []>("layers_17_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6966_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_6966_dilations_0, groups = var_6966_groups_0, pad = var_6966_pad_0, pad_type = var_6966_pad_type_0, strides = var_6966_strides_0, weight = layers_17_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_483_cast_fp16)[name = tensor<string, []>("op_6966_cast_fp16")];
             tensor<string, []> var_6972_pad_type_0 = const()[name = tensor<string, []>("op_6972_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_6972_strides_0 = const()[name = tensor<string, []>("op_6972_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_6972_pad_0 = const()[name = tensor<string, []>("op_6972_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_6972_dilations_0 = const()[name = tensor<string, []>("op_6972_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_6972_groups_0 = const()[name = tensor<string, []>("op_6972_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_17_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304324288))), name = tensor<string, []>("layers_17_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [70877]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304182464))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_17_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(418631808))), name = tensor<string, []>("layers_17_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [70877]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(418489984))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_6972_cast_fp16 = conv(dilations = var_6972_dilations_0, groups = var_6972_groups_0, pad = var_6972_pad_0, pad_type = var_6972_pad_type_0, strides = var_6972_strides_0, weight = layers_17_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_483_cast_fp16)[name = tensor<string, []>("op_6972_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_109_cast_fp16 = add(x = var_6966_cast_fp16, y = var_6972_cast_fp16)[name = tensor<string, []>("x_109_cast_fp16")];
             tensor<fp16, []> var_6974_to_fp16 = const()[name = tensor<string, []>("op_6974_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -4963,16 +4963,16 @@ program(1.0)
             tensor<int32, [1]> out_179_axes_0 = const()[name = tensor<string, []>("out_179_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_6985_to_fp16 = const()[name = tensor<string, []>("op_6985_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_179_cast_fp16 = layer_norm(axes = out_179_axes_0, epsilon = var_6985_to_fp16, x = inputs_179_cast_fp16)[name = tensor<string, []>("out_179_cast_fp16")];
-            tensor<fp16, [1024]> inputs_181_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_181_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304848640)))];
-            tensor<fp16, [1024]> inputs_181_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_181_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304850752)))];
+            tensor<fp16, [1024]> inputs_181_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_181_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(419156160)))];
+            tensor<fp16, [1024]> inputs_181_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_181_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(419158272)))];
             tensor<fp16, []> inputs_181_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_181_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_181_cast_fp16 = batch_norm(beta = inputs_181_beta_0_to_fp16, epsilon = inputs_181_epsilon_0_to_fp16, gamma = inputs_181_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_179_cast_fp16)[name = tensor<string, []>("inputs_181_cast_fp16")];
             tensor<int32, []> var_6999 = const()[name = tensor<string, []>("op_6999"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_181_axes_0 = const()[name = tensor<string, []>("out_181_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7030_to_fp16 = const()[name = tensor<string, []>("op_7030_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_181_cast_fp16 = layer_norm(axes = out_181_axes_0, epsilon = var_7030_to_fp16, x = inputs_181_cast_fp16)[name = tensor<string, []>("out_181_cast_fp16")];
-            tensor<fp16, [1024]> input_485_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_485_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304852864)))];
-            tensor<fp16, [1024]> input_485_beta_0_to_fp16 = const()[name = tensor<string, []>("input_485_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304854976)))];
+            tensor<fp16, [1024]> input_485_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_485_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(419160384)))];
+            tensor<fp16, [1024]> input_485_beta_0_to_fp16 = const()[name = tensor<string, []>("input_485_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(419162496)))];
             tensor<fp16, []> input_485_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_485_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_485_cast_fp16 = batch_norm(beta = input_485_beta_0_to_fp16, epsilon = input_485_epsilon_0_to_fp16, gamma = input_485_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_181_cast_fp16)[name = tensor<string, []>("input_485_cast_fp16")];
             tensor<string, []> var_7050_pad_type_0 = const()[name = tensor<string, []>("op_7050_pad_type_0"), val = tensor<string, []>("valid")];
@@ -4980,14 +4980,14 @@ program(1.0)
             tensor<int32, [4]> var_7050_pad_0 = const()[name = tensor<string, []>("op_7050_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7050_dilations_0 = const()[name = tensor<string, []>("op_7050_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7050_groups_0 = const()[name = tensor<string, []>("op_7050_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_18_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(304857088))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(306954304))), name = tensor<string, []>("layers_18_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_18_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(419164608))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(422310400))), name = tensor<string, []>("layers_18_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7050_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_7050_dilations_0, groups = var_7050_groups_0, pad = var_7050_pad_0, pad_type = var_7050_pad_type_0, strides = var_7050_strides_0, weight = layers_18_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_485_cast_fp16)[name = tensor<string, []>("op_7050_cast_fp16")];
             tensor<string, []> var_7056_pad_type_0 = const()[name = tensor<string, []>("op_7056_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7056_strides_0 = const()[name = tensor<string, []>("op_7056_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7056_pad_0 = const()[name = tensor<string, []>("op_7056_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7056_dilations_0 = const()[name = tensor<string, []>("op_7056_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7056_groups_0 = const()[name = tensor<string, []>("op_7056_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_18_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(307094400))), name = tensor<string, []>("layers_18_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69951]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(306954432))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_18_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(422450560))), name = tensor<string, []>("layers_18_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69951]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(422310592))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7056_cast_fp16 = conv(dilations = var_7056_dilations_0, groups = var_7056_groups_0, pad = var_7056_pad_0, pad_type = var_7056_pad_type_0, strides = var_7056_strides_0, weight = layers_18_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_485_cast_fp16)[name = tensor<string, []>("op_7056_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_487_cast_fp16 = add(x = var_7050_cast_fp16, y = var_7056_cast_fp16)[name = tensor<string, []>("input_487_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_489_cast_fp16 = silu(x = input_487_cast_fp16)[name = tensor<string, []>("input_489_cast_fp16")];
@@ -4996,14 +4996,14 @@ program(1.0)
             tensor<int32, [4]> var_7067_pad_0 = const()[name = tensor<string, []>("op_7067_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7067_dilations_0 = const()[name = tensor<string, []>("op_7067_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7067_groups_0 = const()[name = tensor<string, []>("op_7067_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_18_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(307618752))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(309715968))), name = tensor<string, []>("layers_18_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_18_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(422974912))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(426120704))), name = tensor<string, []>("layers_18_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7067_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7067_dilations_0, groups = var_7067_groups_0, pad = var_7067_pad_0, pad_type = var_7067_pad_type_0, strides = var_7067_strides_0, weight = layers_18_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_489_cast_fp16)[name = tensor<string, []>("op_7067_cast_fp16")];
             tensor<string, []> var_7073_pad_type_0 = const()[name = tensor<string, []>("op_7073_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7073_strides_0 = const()[name = tensor<string, []>("op_7073_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7073_pad_0 = const()[name = tensor<string, []>("op_7073_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7073_dilations_0 = const()[name = tensor<string, []>("op_7073_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7073_groups_0 = const()[name = tensor<string, []>("op_7073_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_18_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(309855168))), name = tensor<string, []>("layers_18_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69492]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(309716096))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_18_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(426259968))), name = tensor<string, []>("layers_18_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69492]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(426120896))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7073_cast_fp16 = conv(dilations = var_7073_dilations_0, groups = var_7073_groups_0, pad = var_7073_pad_0, pad_type = var_7073_pad_type_0, strides = var_7073_strides_0, weight = layers_18_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_489_cast_fp16)[name = tensor<string, []>("op_7073_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_111_cast_fp16 = add(x = var_7067_cast_fp16, y = var_7073_cast_fp16)[name = tensor<string, []>("x_111_cast_fp16")];
             tensor<fp16, []> var_7075_to_fp16 = const()[name = tensor<string, []>("op_7075_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -5012,8 +5012,8 @@ program(1.0)
             tensor<int32, [1]> out_183_axes_0 = const()[name = tensor<string, []>("out_183_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7086_to_fp16 = const()[name = tensor<string, []>("op_7086_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_183_cast_fp16 = layer_norm(axes = out_183_axes_0, epsilon = var_7086_to_fp16, x = inputs_183_cast_fp16)[name = tensor<string, []>("out_183_cast_fp16")];
-            tensor<fp16, [1024]> obj_75_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_75_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(310379520)))];
-            tensor<fp16, [1024]> obj_75_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_75_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(310381632)))];
+            tensor<fp16, [1024]> obj_75_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_75_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(426784320)))];
+            tensor<fp16, [1024]> obj_75_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_75_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(426786432)))];
             tensor<fp16, []> obj_75_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_75_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_75_cast_fp16 = batch_norm(beta = obj_75_beta_0_to_fp16, epsilon = obj_75_epsilon_0_to_fp16, gamma = obj_75_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_183_cast_fp16)[name = tensor<string, []>("obj_75_cast_fp16")];
             tensor<string, []> var_7111_pad_type_0 = const()[name = tensor<string, []>("op_7111_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5021,14 +5021,14 @@ program(1.0)
             tensor<int32, [4]> var_7111_pad_0 = const()[name = tensor<string, []>("op_7111_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7111_dilations_0 = const()[name = tensor<string, []>("op_7111_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7111_groups_0 = const()[name = tensor<string, []>("op_7111_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(310383744))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(310908096))), name = tensor<string, []>("layers_18_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(426788544))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(427575040))), name = tensor<string, []>("layers_18_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7111_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7111_dilations_0, groups = var_7111_groups_0, pad = var_7111_pad_0, pad_type = var_7111_pad_type_0, strides = var_7111_strides_0, weight = layers_18_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_75_cast_fp16)[name = tensor<string, []>("op_7111_cast_fp16")];
             tensor<string, []> var_7117_pad_type_0 = const()[name = tensor<string, []>("op_7117_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7117_strides_0 = const()[name = tensor<string, []>("op_7117_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7117_pad_0 = const()[name = tensor<string, []>("op_7117_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7117_dilations_0 = const()[name = tensor<string, []>("op_7117_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7117_groups_0 = const()[name = tensor<string, []>("op_7117_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(310940480))), name = tensor<string, []>("layers_18_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16084]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(310908224))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(427607488))), name = tensor<string, []>("layers_18_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16084]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(427575232))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7117_cast_fp16 = conv(dilations = var_7117_dilations_0, groups = var_7117_groups_0, pad = var_7117_pad_0, pad_type = var_7117_pad_type_0, strides = var_7117_strides_0, weight = layers_18_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_75_cast_fp16)[name = tensor<string, []>("op_7117_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_73_cast_fp16 = add(x = var_7111_cast_fp16, y = var_7117_cast_fp16)[name = tensor<string, []>("query_73_cast_fp16")];
             tensor<string, []> var_7126_pad_type_0 = const()[name = tensor<string, []>("op_7126_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5036,14 +5036,14 @@ program(1.0)
             tensor<int32, [4]> var_7126_pad_0 = const()[name = tensor<string, []>("op_7126_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7126_dilations_0 = const()[name = tensor<string, []>("op_7126_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7126_groups_0 = const()[name = tensor<string, []>("op_7126_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311071616))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311595968))), name = tensor<string, []>("layers_18_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(427738624))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(428525120))), name = tensor<string, []>("layers_18_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7126_cast_fp16 = conv(dilations = var_7126_dilations_0, groups = var_7126_groups_0, pad = var_7126_pad_0, pad_type = var_7126_pad_type_0, strides = var_7126_strides_0, weight = layers_18_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_75_cast_fp16)[name = tensor<string, []>("op_7126_cast_fp16")];
             tensor<string, []> var_7132_pad_type_0 = const()[name = tensor<string, []>("op_7132_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7132_strides_0 = const()[name = tensor<string, []>("op_7132_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7132_pad_0 = const()[name = tensor<string, []>("op_7132_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7132_dilations_0 = const()[name = tensor<string, []>("op_7132_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7132_groups_0 = const()[name = tensor<string, []>("op_7132_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311634496))), name = tensor<string, []>("layers_18_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19161]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311596096))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(428563712))), name = tensor<string, []>("layers_18_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [19161]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(428525312))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7132_cast_fp16 = conv(dilations = var_7132_dilations_0, groups = var_7132_groups_0, pad = var_7132_pad_0, pad_type = var_7132_pad_type_0, strides = var_7132_strides_0, weight = layers_18_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_75_cast_fp16)[name = tensor<string, []>("op_7132_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_37_cast_fp16 = add(x = var_7126_cast_fp16, y = var_7132_cast_fp16)[name = tensor<string, []>("key_37_cast_fp16")];
             tensor<string, []> var_7142_pad_type_0 = const()[name = tensor<string, []>("op_7142_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5051,33 +5051,33 @@ program(1.0)
             tensor<int32, [4]> var_7142_pad_0 = const()[name = tensor<string, []>("op_7142_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7142_dilations_0 = const()[name = tensor<string, []>("op_7142_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7142_groups_0 = const()[name = tensor<string, []>("op_7142_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(311765632))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312289984))), name = tensor<string, []>("layers_18_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(428694848))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(429481344))), name = tensor<string, []>("layers_18_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7142_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7142_dilations_0, groups = var_7142_groups_0, pad = var_7142_pad_0, pad_type = var_7142_pad_type_0, strides = var_7142_strides_0, weight = layers_18_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_75_cast_fp16)[name = tensor<string, []>("op_7142_cast_fp16")];
             tensor<string, []> var_7148_pad_type_0 = const()[name = tensor<string, []>("op_7148_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7148_strides_0 = const()[name = tensor<string, []>("op_7148_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7148_pad_0 = const()[name = tensor<string, []>("op_7148_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7148_dilations_0 = const()[name = tensor<string, []>("op_7148_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7148_groups_0 = const()[name = tensor<string, []>("op_7148_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312321344))), name = tensor<string, []>("layers_18_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15569]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312290112))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(429512768))), name = tensor<string, []>("layers_18_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15569]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(429481536))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7148_cast_fp16 = conv(dilations = var_7148_dilations_0, groups = var_7148_groups_0, pad = var_7148_pad_0, pad_type = var_7148_pad_type_0, strides = var_7148_strides_0, weight = layers_18_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_75_cast_fp16)[name = tensor<string, []>("op_7148_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_37_cast_fp16 = add(x = var_7142_cast_fp16, y = var_7148_cast_fp16)[name = tensor<string, []>("value_37_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_7151_to_fp16 = const()[name = tensor<string, []>("op_7151_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312452480)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_7151_to_fp16 = const()[name = tensor<string, []>("op_7151_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(429643904)))];
             tensor<fp16, [1, 1024, 1, 188]> query_75_cast_fp16 = add(x = query_73_cast_fp16, y = var_7151_to_fp16)[name = tensor<string, []>("query_75_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_7154_to_fp16 = const()[name = tensor<string, []>("op_7154_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312454592)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_7154_to_fp16 = const()[name = tensor<string, []>("op_7154_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(429646016)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_37_cast_fp16 = add(x = query_73_cast_fp16, y = var_7154_to_fp16)[name = tensor<string, []>("q_with_bias_v_37_cast_fp16")];
             tensor<string, []> var_7164_pad_type_0 = const()[name = tensor<string, []>("op_7164_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7164_strides_0 = const()[name = tensor<string, []>("op_7164_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7164_pad_0 = const()[name = tensor<string, []>("op_7164_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7164_dilations_0 = const()[name = tensor<string, []>("op_7164_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7164_groups_0 = const()[name = tensor<string, []>("op_7164_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312456704))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312981056))), name = tensor<string, []>("layers_18_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(429648128))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(430434624))), name = tensor<string, []>("layers_18_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_7164_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7164_dilations_0, groups = var_7164_groups_0, pad = var_7164_pad_0, pad_type = var_7164_pad_type_0, strides = var_7164_strides_0, weight = layers_18_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_7164_cast_fp16")];
             tensor<string, []> var_7170_pad_type_0 = const()[name = tensor<string, []>("op_7170_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7170_strides_0 = const()[name = tensor<string, []>("op_7170_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7170_pad_0 = const()[name = tensor<string, []>("op_7170_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7170_dilations_0 = const()[name = tensor<string, []>("op_7170_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7170_groups_0 = const()[name = tensor<string, []>("op_7170_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313033472))), name = tensor<string, []>("layers_18_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [26081]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(312981184))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(430487104))), name = tensor<string, []>("layers_18_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [26081]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(430434816))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_7170_cast_fp16 = conv(dilations = var_7170_dilations_0, groups = var_7170_groups_0, pad = var_7170_pad_0, pad_type = var_7170_pad_type_0, strides = var_7170_strides_0, weight = layers_18_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_7170_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_37_cast_fp16 = add(x = var_7164_cast_fp16, y = var_7170_cast_fp16)[name = tensor<string, []>("p_37_cast_fp16")];
             tensor<int32, [4]> var_7174 = const()[name = tensor<string, []>("op_7174"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -5128,22 +5128,22 @@ program(1.0)
             tensor<int32, [4]> var_7227_pad_0 = const()[name = tensor<string, []>("op_7227_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7227_dilations_0 = const()[name = tensor<string, []>("op_7227_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7227_groups_0 = const()[name = tensor<string, []>("op_7227_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313164608))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313688960))), name = tensor<string, []>("layers_18_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(430618240))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(431404736))), name = tensor<string, []>("layers_18_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7227_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7227_dilations_0, groups = var_7227_groups_0, pad = var_7227_pad_0, pad_type = var_7227_pad_type_0, strides = var_7227_strides_0, weight = layers_18_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_491_cast_fp16)[name = tensor<string, []>("op_7227_cast_fp16")];
             tensor<string, []> var_7233_pad_type_0 = const()[name = tensor<string, []>("op_7233_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7233_strides_0 = const()[name = tensor<string, []>("op_7233_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7233_pad_0 = const()[name = tensor<string, []>("op_7233_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7233_dilations_0 = const()[name = tensor<string, []>("op_7233_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7233_groups_0 = const()[name = tensor<string, []>("op_7233_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313721152))), name = tensor<string, []>("layers_18_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15969]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313689088))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(431436992))), name = tensor<string, []>("layers_18_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15969]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(431404928))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7233_cast_fp16 = conv(dilations = var_7233_dilations_0, groups = var_7233_groups_0, pad = var_7233_pad_0, pad_type = var_7233_pad_type_0, strides = var_7233_strides_0, weight = layers_18_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_491_cast_fp16)[name = tensor<string, []>("op_7233_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_77_cast_fp16 = add(x = var_7227_cast_fp16, y = var_7233_cast_fp16)[name = tensor<string, []>("obj_77_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_185_cast_fp16 = add(x = inputs_183_cast_fp16, y = obj_77_cast_fp16)[name = tensor<string, []>("inputs_185_cast_fp16")];
             tensor<int32, [1]> out_185_axes_0 = const()[name = tensor<string, []>("out_185_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7244_to_fp16 = const()[name = tensor<string, []>("op_7244_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_185_cast_fp16 = layer_norm(axes = out_185_axes_0, epsilon = var_7244_to_fp16, x = inputs_185_cast_fp16)[name = tensor<string, []>("out_185_cast_fp16")];
-            tensor<fp16, [1024]> input_493_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_493_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313852288)))];
-            tensor<fp16, [1024]> input_493_beta_0_to_fp16 = const()[name = tensor<string, []>("input_493_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313854400)))];
+            tensor<fp16, [1024]> input_493_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_493_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(431568128)))];
+            tensor<fp16, [1024]> input_493_beta_0_to_fp16 = const()[name = tensor<string, []>("input_493_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(431570240)))];
             tensor<fp16, []> input_493_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_493_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_493_cast_fp16 = batch_norm(beta = input_493_beta_0_to_fp16, epsilon = input_493_epsilon_0_to_fp16, gamma = input_493_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_185_cast_fp16)[name = tensor<string, []>("input_493_cast_fp16")];
             tensor<string, []> var_7265_pad_type_0 = const()[name = tensor<string, []>("op_7265_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5151,14 +5151,14 @@ program(1.0)
             tensor<int32, [4]> var_7265_pad_0 = const()[name = tensor<string, []>("op_7265_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7265_dilations_0 = const()[name = tensor<string, []>("op_7265_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7265_groups_0 = const()[name = tensor<string, []>("op_7265_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_18_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(313856512))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(314905152))), name = tensor<string, []>("layers_18_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_18_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(431572352))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(433145280))), name = tensor<string, []>("layers_18_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_7265_cast_fp16 = conv(dilations = var_7265_dilations_0, groups = var_7265_groups_0, pad = var_7265_pad_0, pad_type = var_7265_pad_type_0, strides = var_7265_strides_0, weight = layers_18_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_493_cast_fp16)[name = tensor<string, []>("op_7265_cast_fp16")];
             tensor<string, []> var_7271_pad_type_0 = const()[name = tensor<string, []>("op_7271_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7271_strides_0 = const()[name = tensor<string, []>("op_7271_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7271_pad_0 = const()[name = tensor<string, []>("op_7271_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7271_dilations_0 = const()[name = tensor<string, []>("op_7271_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7271_groups_0 = const()[name = tensor<string, []>("op_7271_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_18_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(314972352))), name = tensor<string, []>("layers_18_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33490]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(314905280))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_18_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(433212544))), name = tensor<string, []>("layers_18_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33490]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(433145472))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_7271_cast_fp16 = conv(dilations = var_7271_dilations_0, groups = var_7271_groups_0, pad = var_7271_pad_0, pad_type = var_7271_pad_type_0, strides = var_7271_strides_0, weight = layers_18_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_493_cast_fp16)[name = tensor<string, []>("op_7271_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_495_cast_fp16 = add(x = var_7265_cast_fp16, y = var_7271_cast_fp16)[name = tensor<string, []>("input_495_cast_fp16")];
             tensor<int32, []> input_497_split_num_splits_0 = const()[name = tensor<string, []>("input_497_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -5171,8 +5171,8 @@ program(1.0)
             tensor<int32, []> input_499_groups_0 = const()[name = tensor<string, []>("input_499_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_499_strides_0 = const()[name = tensor<string, []>("input_499_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_499_dilations_0 = const()[name = tensor<string, []>("input_499_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_304_to_fp16 = const()[name = tensor<string, []>("const_304_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315234560)))];
-            tensor<fp16, [1024]> const_305_to_fp16 = const()[name = tensor<string, []>("const_305_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315253056)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_304_to_fp16 = const()[name = tensor<string, []>("const_304_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(433474752)))];
+            tensor<fp16, [1024]> const_305_to_fp16 = const()[name = tensor<string, []>("const_305_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(433493248)))];
             tensor<fp16, [1, 1024, 1, 188]> input_501_cast_fp16 = conv(bias = const_305_to_fp16, dilations = input_499_dilations_0, groups = input_499_groups_0, pad = input_499_pad_0, pad_type = input_499_pad_type_0, strides = input_499_strides_0, weight = const_304_to_fp16, x = input_497_cast_fp16)[name = tensor<string, []>("input_501_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_503_cast_fp16 = silu(x = input_501_cast_fp16)[name = tensor<string, []>("input_503_cast_fp16")];
             tensor<string, []> var_7293_pad_type_0 = const()[name = tensor<string, []>("op_7293_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5180,22 +5180,22 @@ program(1.0)
             tensor<int32, [4]> var_7293_pad_0 = const()[name = tensor<string, []>("op_7293_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7293_dilations_0 = const()[name = tensor<string, []>("op_7293_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7293_groups_0 = const()[name = tensor<string, []>("op_7293_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315255168))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315779520))), name = tensor<string, []>("layers_18_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(433495360))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(434281856))), name = tensor<string, []>("layers_18_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7293_cast_fp16 = conv(dilations = var_7293_dilations_0, groups = var_7293_groups_0, pad = var_7293_pad_0, pad_type = var_7293_pad_type_0, strides = var_7293_strides_0, weight = layers_18_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_503_cast_fp16)[name = tensor<string, []>("op_7293_cast_fp16")];
             tensor<string, []> var_7299_pad_type_0 = const()[name = tensor<string, []>("op_7299_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7299_strides_0 = const()[name = tensor<string, []>("op_7299_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7299_pad_0 = const()[name = tensor<string, []>("op_7299_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7299_dilations_0 = const()[name = tensor<string, []>("op_7299_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7299_groups_0 = const()[name = tensor<string, []>("op_7299_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_18_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315810240))), name = tensor<string, []>("layers_18_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15234]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315779648))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_18_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(434312640))), name = tensor<string, []>("layers_18_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15234]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(434282048))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7299_cast_fp16 = conv(dilations = var_7299_dilations_0, groups = var_7299_groups_0, pad = var_7299_pad_0, pad_type = var_7299_pad_type_0, strides = var_7299_strides_0, weight = layers_18_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_503_cast_fp16)[name = tensor<string, []>("op_7299_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_113_cast_fp16 = add(x = var_7293_cast_fp16, y = var_7299_cast_fp16)[name = tensor<string, []>("x_113_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_187_cast_fp16 = add(x = inputs_185_cast_fp16, y = x_113_cast_fp16)[name = tensor<string, []>("inputs_187_cast_fp16")];
             tensor<int32, [1]> out_187_axes_0 = const()[name = tensor<string, []>("out_187_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7310_to_fp16 = const()[name = tensor<string, []>("op_7310_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_187_cast_fp16 = layer_norm(axes = out_187_axes_0, epsilon = var_7310_to_fp16, x = inputs_187_cast_fp16)[name = tensor<string, []>("out_187_cast_fp16")];
-            tensor<fp16, [1024]> input_505_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_505_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315941376)))];
-            tensor<fp16, [1024]> input_505_beta_0_to_fp16 = const()[name = tensor<string, []>("input_505_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315943488)))];
+            tensor<fp16, [1024]> input_505_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_505_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(434443776)))];
+            tensor<fp16, [1024]> input_505_beta_0_to_fp16 = const()[name = tensor<string, []>("input_505_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(434445888)))];
             tensor<fp16, []> input_505_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_505_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_505_cast_fp16 = batch_norm(beta = input_505_beta_0_to_fp16, epsilon = input_505_epsilon_0_to_fp16, gamma = input_505_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_187_cast_fp16)[name = tensor<string, []>("input_505_cast_fp16")];
             tensor<string, []> var_7330_pad_type_0 = const()[name = tensor<string, []>("op_7330_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5203,14 +5203,14 @@ program(1.0)
             tensor<int32, [4]> var_7330_pad_0 = const()[name = tensor<string, []>("op_7330_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7330_dilations_0 = const()[name = tensor<string, []>("op_7330_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7330_groups_0 = const()[name = tensor<string, []>("op_7330_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_18_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(315945600))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318042816))), name = tensor<string, []>("layers_18_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_18_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(434448000))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(437593792))), name = tensor<string, []>("layers_18_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7330_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_7330_dilations_0, groups = var_7330_groups_0, pad = var_7330_pad_0, pad_type = var_7330_pad_type_0, strides = var_7330_strides_0, weight = layers_18_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_505_cast_fp16)[name = tensor<string, []>("op_7330_cast_fp16")];
             tensor<string, []> var_7336_pad_type_0 = const()[name = tensor<string, []>("op_7336_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7336_strides_0 = const()[name = tensor<string, []>("op_7336_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7336_pad_0 = const()[name = tensor<string, []>("op_7336_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7336_dilations_0 = const()[name = tensor<string, []>("op_7336_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7336_groups_0 = const()[name = tensor<string, []>("op_7336_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_18_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318174336))), name = tensor<string, []>("layers_18_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [65643]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318042944))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_18_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(437725376))), name = tensor<string, []>("layers_18_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [65643]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(437593984))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7336_cast_fp16 = conv(dilations = var_7336_dilations_0, groups = var_7336_groups_0, pad = var_7336_pad_0, pad_type = var_7336_pad_type_0, strides = var_7336_strides_0, weight = layers_18_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_505_cast_fp16)[name = tensor<string, []>("op_7336_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_507_cast_fp16 = add(x = var_7330_cast_fp16, y = var_7336_cast_fp16)[name = tensor<string, []>("input_507_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_509_cast_fp16 = silu(x = input_507_cast_fp16)[name = tensor<string, []>("input_509_cast_fp16")];
@@ -5219,14 +5219,14 @@ program(1.0)
             tensor<int32, [4]> var_7347_pad_0 = const()[name = tensor<string, []>("op_7347_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7347_dilations_0 = const()[name = tensor<string, []>("op_7347_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7347_groups_0 = const()[name = tensor<string, []>("op_7347_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_18_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(318698688))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(320795904))), name = tensor<string, []>("layers_18_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_18_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(438249728))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(441395520))), name = tensor<string, []>("layers_18_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7347_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7347_dilations_0, groups = var_7347_groups_0, pad = var_7347_pad_0, pad_type = var_7347_pad_type_0, strides = var_7347_strides_0, weight = layers_18_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_509_cast_fp16)[name = tensor<string, []>("op_7347_cast_fp16")];
             tensor<string, []> var_7353_pad_type_0 = const()[name = tensor<string, []>("op_7353_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7353_strides_0 = const()[name = tensor<string, []>("op_7353_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7353_pad_0 = const()[name = tensor<string, []>("op_7353_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7353_dilations_0 = const()[name = tensor<string, []>("op_7353_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7353_groups_0 = const()[name = tensor<string, []>("op_7353_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_18_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(320933888))), name = tensor<string, []>("layers_18_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [68891]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(320796032))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_18_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(441533568))), name = tensor<string, []>("layers_18_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [68891]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(441395712))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7353_cast_fp16 = conv(dilations = var_7353_dilations_0, groups = var_7353_groups_0, pad = var_7353_pad_0, pad_type = var_7353_pad_type_0, strides = var_7353_strides_0, weight = layers_18_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_509_cast_fp16)[name = tensor<string, []>("op_7353_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_115_cast_fp16 = add(x = var_7347_cast_fp16, y = var_7353_cast_fp16)[name = tensor<string, []>("x_115_cast_fp16")];
             tensor<fp16, []> var_7355_to_fp16 = const()[name = tensor<string, []>("op_7355_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -5235,16 +5235,16 @@ program(1.0)
             tensor<int32, [1]> out_189_axes_0 = const()[name = tensor<string, []>("out_189_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7366_to_fp16 = const()[name = tensor<string, []>("op_7366_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_189_cast_fp16 = layer_norm(axes = out_189_axes_0, epsilon = var_7366_to_fp16, x = inputs_189_cast_fp16)[name = tensor<string, []>("out_189_cast_fp16")];
-            tensor<fp16, [1024]> inputs_191_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_191_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(321458240)))];
-            tensor<fp16, [1024]> inputs_191_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_191_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(321460352)))];
+            tensor<fp16, [1024]> inputs_191_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_191_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(442057920)))];
+            tensor<fp16, [1024]> inputs_191_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_191_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(442060032)))];
             tensor<fp16, []> inputs_191_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_191_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_191_cast_fp16 = batch_norm(beta = inputs_191_beta_0_to_fp16, epsilon = inputs_191_epsilon_0_to_fp16, gamma = inputs_191_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_189_cast_fp16)[name = tensor<string, []>("inputs_191_cast_fp16")];
             tensor<int32, []> var_7380 = const()[name = tensor<string, []>("op_7380"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_191_axes_0 = const()[name = tensor<string, []>("out_191_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7411_to_fp16 = const()[name = tensor<string, []>("op_7411_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_191_cast_fp16 = layer_norm(axes = out_191_axes_0, epsilon = var_7411_to_fp16, x = inputs_191_cast_fp16)[name = tensor<string, []>("out_191_cast_fp16")];
-            tensor<fp16, [1024]> input_511_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_511_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(321462464)))];
-            tensor<fp16, [1024]> input_511_beta_0_to_fp16 = const()[name = tensor<string, []>("input_511_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(321464576)))];
+            tensor<fp16, [1024]> input_511_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_511_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(442062144)))];
+            tensor<fp16, [1024]> input_511_beta_0_to_fp16 = const()[name = tensor<string, []>("input_511_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(442064256)))];
             tensor<fp16, []> input_511_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_511_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_511_cast_fp16 = batch_norm(beta = input_511_beta_0_to_fp16, epsilon = input_511_epsilon_0_to_fp16, gamma = input_511_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_191_cast_fp16)[name = tensor<string, []>("input_511_cast_fp16")];
             tensor<string, []> var_7431_pad_type_0 = const()[name = tensor<string, []>("op_7431_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5252,14 +5252,14 @@ program(1.0)
             tensor<int32, [4]> var_7431_pad_0 = const()[name = tensor<string, []>("op_7431_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7431_dilations_0 = const()[name = tensor<string, []>("op_7431_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7431_groups_0 = const()[name = tensor<string, []>("op_7431_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_19_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(321466688))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(323563904))), name = tensor<string, []>("layers_19_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_19_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(442066368))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(445212160))), name = tensor<string, []>("layers_19_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7431_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_7431_dilations_0, groups = var_7431_groups_0, pad = var_7431_pad_0, pad_type = var_7431_pad_type_0, strides = var_7431_strides_0, weight = layers_19_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_511_cast_fp16)[name = tensor<string, []>("op_7431_cast_fp16")];
             tensor<string, []> var_7437_pad_type_0 = const()[name = tensor<string, []>("op_7437_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7437_strides_0 = const()[name = tensor<string, []>("op_7437_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7437_pad_0 = const()[name = tensor<string, []>("op_7437_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7437_dilations_0 = const()[name = tensor<string, []>("op_7437_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7437_groups_0 = const()[name = tensor<string, []>("op_7437_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_19_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(323697280))), name = tensor<string, []>("layers_19_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [66573]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(323564032))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_19_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(445345600))), name = tensor<string, []>("layers_19_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [66573]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(445212352))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7437_cast_fp16 = conv(dilations = var_7437_dilations_0, groups = var_7437_groups_0, pad = var_7437_pad_0, pad_type = var_7437_pad_type_0, strides = var_7437_strides_0, weight = layers_19_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_511_cast_fp16)[name = tensor<string, []>("op_7437_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_513_cast_fp16 = add(x = var_7431_cast_fp16, y = var_7437_cast_fp16)[name = tensor<string, []>("input_513_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_515_cast_fp16 = silu(x = input_513_cast_fp16)[name = tensor<string, []>("input_515_cast_fp16")];
@@ -5268,14 +5268,14 @@ program(1.0)
             tensor<int32, [4]> var_7448_pad_0 = const()[name = tensor<string, []>("op_7448_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7448_dilations_0 = const()[name = tensor<string, []>("op_7448_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7448_groups_0 = const()[name = tensor<string, []>("op_7448_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_19_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(324221632))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(326318848))), name = tensor<string, []>("layers_19_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_19_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(445869952))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(449015744))), name = tensor<string, []>("layers_19_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7448_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7448_dilations_0, groups = var_7448_groups_0, pad = var_7448_pad_0, pad_type = var_7448_pad_type_0, strides = var_7448_strides_0, weight = layers_19_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_515_cast_fp16)[name = tensor<string, []>("op_7448_cast_fp16")];
             tensor<string, []> var_7454_pad_type_0 = const()[name = tensor<string, []>("op_7454_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7454_strides_0 = const()[name = tensor<string, []>("op_7454_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7454_pad_0 = const()[name = tensor<string, []>("op_7454_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7454_dilations_0 = const()[name = tensor<string, []>("op_7454_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7454_groups_0 = const()[name = tensor<string, []>("op_7454_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_19_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(326458560))), name = tensor<string, []>("layers_19_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69739]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(326318976))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_19_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(449155520))), name = tensor<string, []>("layers_19_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69739]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(449015936))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7454_cast_fp16 = conv(dilations = var_7454_dilations_0, groups = var_7454_groups_0, pad = var_7454_pad_0, pad_type = var_7454_pad_type_0, strides = var_7454_strides_0, weight = layers_19_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_515_cast_fp16)[name = tensor<string, []>("op_7454_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_117_cast_fp16 = add(x = var_7448_cast_fp16, y = var_7454_cast_fp16)[name = tensor<string, []>("x_117_cast_fp16")];
             tensor<fp16, []> var_7456_to_fp16 = const()[name = tensor<string, []>("op_7456_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -5284,8 +5284,8 @@ program(1.0)
             tensor<int32, [1]> out_193_axes_0 = const()[name = tensor<string, []>("out_193_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7467_to_fp16 = const()[name = tensor<string, []>("op_7467_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_193_cast_fp16 = layer_norm(axes = out_193_axes_0, epsilon = var_7467_to_fp16, x = inputs_193_cast_fp16)[name = tensor<string, []>("out_193_cast_fp16")];
-            tensor<fp16, [1024]> obj_79_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_79_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(326982912)))];
-            tensor<fp16, [1024]> obj_79_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_79_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(326985024)))];
+            tensor<fp16, [1024]> obj_79_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_79_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(449679872)))];
+            tensor<fp16, [1024]> obj_79_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_79_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(449681984)))];
             tensor<fp16, []> obj_79_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_79_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_79_cast_fp16 = batch_norm(beta = obj_79_beta_0_to_fp16, epsilon = obj_79_epsilon_0_to_fp16, gamma = obj_79_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_193_cast_fp16)[name = tensor<string, []>("obj_79_cast_fp16")];
             tensor<string, []> var_7492_pad_type_0 = const()[name = tensor<string, []>("op_7492_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5293,14 +5293,14 @@ program(1.0)
             tensor<int32, [4]> var_7492_pad_0 = const()[name = tensor<string, []>("op_7492_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7492_dilations_0 = const()[name = tensor<string, []>("op_7492_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7492_groups_0 = const()[name = tensor<string, []>("op_7492_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(326987136))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(327511488))), name = tensor<string, []>("layers_19_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(449684096))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(450470592))), name = tensor<string, []>("layers_19_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7492_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7492_dilations_0, groups = var_7492_groups_0, pad = var_7492_pad_0, pad_type = var_7492_pad_type_0, strides = var_7492_strides_0, weight = layers_19_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_79_cast_fp16)[name = tensor<string, []>("op_7492_cast_fp16")];
             tensor<string, []> var_7498_pad_type_0 = const()[name = tensor<string, []>("op_7498_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7498_strides_0 = const()[name = tensor<string, []>("op_7498_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7498_pad_0 = const()[name = tensor<string, []>("op_7498_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7498_dilations_0 = const()[name = tensor<string, []>("op_7498_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7498_groups_0 = const()[name = tensor<string, []>("op_7498_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(327544960))), name = tensor<string, []>("layers_19_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16633]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(327511616))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(450504128))), name = tensor<string, []>("layers_19_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16633]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(450470784))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7498_cast_fp16 = conv(dilations = var_7498_dilations_0, groups = var_7498_groups_0, pad = var_7498_pad_0, pad_type = var_7498_pad_type_0, strides = var_7498_strides_0, weight = layers_19_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_79_cast_fp16)[name = tensor<string, []>("op_7498_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_77_cast_fp16 = add(x = var_7492_cast_fp16, y = var_7498_cast_fp16)[name = tensor<string, []>("query_77_cast_fp16")];
             tensor<string, []> var_7507_pad_type_0 = const()[name = tensor<string, []>("op_7507_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5308,14 +5308,14 @@ program(1.0)
             tensor<int32, [4]> var_7507_pad_0 = const()[name = tensor<string, []>("op_7507_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7507_dilations_0 = const()[name = tensor<string, []>("op_7507_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7507_groups_0 = const()[name = tensor<string, []>("op_7507_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(327676096))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(328200448))), name = tensor<string, []>("layers_19_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(450635264))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(451421760))), name = tensor<string, []>("layers_19_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7507_cast_fp16 = conv(dilations = var_7507_dilations_0, groups = var_7507_groups_0, pad = var_7507_pad_0, pad_type = var_7507_pad_type_0, strides = var_7507_strides_0, weight = layers_19_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_79_cast_fp16)[name = tensor<string, []>("op_7507_cast_fp16")];
             tensor<string, []> var_7513_pad_type_0 = const()[name = tensor<string, []>("op_7513_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7513_strides_0 = const()[name = tensor<string, []>("op_7513_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7513_pad_0 = const()[name = tensor<string, []>("op_7513_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7513_dilations_0 = const()[name = tensor<string, []>("op_7513_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7513_groups_0 = const()[name = tensor<string, []>("op_7513_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(328234176))), name = tensor<string, []>("layers_19_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16738]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(328200576))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(451455552))), name = tensor<string, []>("layers_19_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16738]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(451421952))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7513_cast_fp16 = conv(dilations = var_7513_dilations_0, groups = var_7513_groups_0, pad = var_7513_pad_0, pad_type = var_7513_pad_type_0, strides = var_7513_strides_0, weight = layers_19_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_79_cast_fp16)[name = tensor<string, []>("op_7513_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_39_cast_fp16 = add(x = var_7507_cast_fp16, y = var_7513_cast_fp16)[name = tensor<string, []>("key_39_cast_fp16")];
             tensor<string, []> var_7523_pad_type_0 = const()[name = tensor<string, []>("op_7523_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5323,33 +5323,33 @@ program(1.0)
             tensor<int32, [4]> var_7523_pad_0 = const()[name = tensor<string, []>("op_7523_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7523_dilations_0 = const()[name = tensor<string, []>("op_7523_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7523_groups_0 = const()[name = tensor<string, []>("op_7523_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(328365312))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(328889664))), name = tensor<string, []>("layers_19_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(451586688))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(452373184))), name = tensor<string, []>("layers_19_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7523_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7523_dilations_0, groups = var_7523_groups_0, pad = var_7523_pad_0, pad_type = var_7523_pad_type_0, strides = var_7523_strides_0, weight = layers_19_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_79_cast_fp16)[name = tensor<string, []>("op_7523_cast_fp16")];
             tensor<string, []> var_7529_pad_type_0 = const()[name = tensor<string, []>("op_7529_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7529_strides_0 = const()[name = tensor<string, []>("op_7529_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7529_pad_0 = const()[name = tensor<string, []>("op_7529_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7529_dilations_0 = const()[name = tensor<string, []>("op_7529_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7529_groups_0 = const()[name = tensor<string, []>("op_7529_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(328918720))), name = tensor<string, []>("layers_19_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14428]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(328889792))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(452402304))), name = tensor<string, []>("layers_19_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14428]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(452373376))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7529_cast_fp16 = conv(dilations = var_7529_dilations_0, groups = var_7529_groups_0, pad = var_7529_pad_0, pad_type = var_7529_pad_type_0, strides = var_7529_strides_0, weight = layers_19_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_79_cast_fp16)[name = tensor<string, []>("op_7529_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_39_cast_fp16 = add(x = var_7523_cast_fp16, y = var_7529_cast_fp16)[name = tensor<string, []>("value_39_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_7532_to_fp16 = const()[name = tensor<string, []>("op_7532_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(329049856)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_7532_to_fp16 = const()[name = tensor<string, []>("op_7532_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(452533440)))];
             tensor<fp16, [1, 1024, 1, 188]> query_79_cast_fp16 = add(x = query_77_cast_fp16, y = var_7532_to_fp16)[name = tensor<string, []>("query_79_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_7535_to_fp16 = const()[name = tensor<string, []>("op_7535_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(329051968)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_7535_to_fp16 = const()[name = tensor<string, []>("op_7535_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(452535552)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_39_cast_fp16 = add(x = query_77_cast_fp16, y = var_7535_to_fp16)[name = tensor<string, []>("q_with_bias_v_39_cast_fp16")];
             tensor<string, []> var_7545_pad_type_0 = const()[name = tensor<string, []>("op_7545_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7545_strides_0 = const()[name = tensor<string, []>("op_7545_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7545_pad_0 = const()[name = tensor<string, []>("op_7545_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7545_dilations_0 = const()[name = tensor<string, []>("op_7545_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7545_groups_0 = const()[name = tensor<string, []>("op_7545_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(329054080))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(329578432))), name = tensor<string, []>("layers_19_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(452537664))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(453324160))), name = tensor<string, []>("layers_19_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_7545_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7545_dilations_0, groups = var_7545_groups_0, pad = var_7545_pad_0, pad_type = var_7545_pad_type_0, strides = var_7545_strides_0, weight = layers_19_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_7545_cast_fp16")];
             tensor<string, []> var_7551_pad_type_0 = const()[name = tensor<string, []>("op_7551_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7551_strides_0 = const()[name = tensor<string, []>("op_7551_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7551_pad_0 = const()[name = tensor<string, []>("op_7551_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7551_dilations_0 = const()[name = tensor<string, []>("op_7551_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7551_groups_0 = const()[name = tensor<string, []>("op_7551_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(329636544))), name = tensor<string, []>("layers_19_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [28947]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(329578560))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(453382336))), name = tensor<string, []>("layers_19_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [28947]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(453324352))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_7551_cast_fp16 = conv(dilations = var_7551_dilations_0, groups = var_7551_groups_0, pad = var_7551_pad_0, pad_type = var_7551_pad_type_0, strides = var_7551_strides_0, weight = layers_19_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_7551_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_39_cast_fp16 = add(x = var_7545_cast_fp16, y = var_7551_cast_fp16)[name = tensor<string, []>("p_39_cast_fp16")];
             tensor<int32, [4]> var_7555 = const()[name = tensor<string, []>("op_7555"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -5400,22 +5400,22 @@ program(1.0)
             tensor<int32, [4]> var_7608_pad_0 = const()[name = tensor<string, []>("op_7608_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7608_dilations_0 = const()[name = tensor<string, []>("op_7608_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7608_groups_0 = const()[name = tensor<string, []>("op_7608_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(329767680))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(330292032))), name = tensor<string, []>("layers_19_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(453513472))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(454299968))), name = tensor<string, []>("layers_19_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7608_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7608_dilations_0, groups = var_7608_groups_0, pad = var_7608_pad_0, pad_type = var_7608_pad_type_0, strides = var_7608_strides_0, weight = layers_19_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_517_cast_fp16)[name = tensor<string, []>("op_7608_cast_fp16")];
             tensor<string, []> var_7614_pad_type_0 = const()[name = tensor<string, []>("op_7614_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7614_strides_0 = const()[name = tensor<string, []>("op_7614_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7614_pad_0 = const()[name = tensor<string, []>("op_7614_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7614_dilations_0 = const()[name = tensor<string, []>("op_7614_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7614_groups_0 = const()[name = tensor<string, []>("op_7614_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(330321920))), name = tensor<string, []>("layers_19_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14839]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(330292160))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(454329920))), name = tensor<string, []>("layers_19_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14839]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(454300160))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7614_cast_fp16 = conv(dilations = var_7614_dilations_0, groups = var_7614_groups_0, pad = var_7614_pad_0, pad_type = var_7614_pad_type_0, strides = var_7614_strides_0, weight = layers_19_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_517_cast_fp16)[name = tensor<string, []>("op_7614_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_81_cast_fp16 = add(x = var_7608_cast_fp16, y = var_7614_cast_fp16)[name = tensor<string, []>("obj_81_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_195_cast_fp16 = add(x = inputs_193_cast_fp16, y = obj_81_cast_fp16)[name = tensor<string, []>("inputs_195_cast_fp16")];
             tensor<int32, [1]> out_195_axes_0 = const()[name = tensor<string, []>("out_195_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7625_to_fp16 = const()[name = tensor<string, []>("op_7625_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_195_cast_fp16 = layer_norm(axes = out_195_axes_0, epsilon = var_7625_to_fp16, x = inputs_195_cast_fp16)[name = tensor<string, []>("out_195_cast_fp16")];
-            tensor<fp16, [1024]> input_519_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_519_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(330453056)))];
-            tensor<fp16, [1024]> input_519_beta_0_to_fp16 = const()[name = tensor<string, []>("input_519_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(330455168)))];
+            tensor<fp16, [1024]> input_519_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_519_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(454461056)))];
+            tensor<fp16, [1024]> input_519_beta_0_to_fp16 = const()[name = tensor<string, []>("input_519_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(454463168)))];
             tensor<fp16, []> input_519_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_519_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_519_cast_fp16 = batch_norm(beta = input_519_beta_0_to_fp16, epsilon = input_519_epsilon_0_to_fp16, gamma = input_519_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_195_cast_fp16)[name = tensor<string, []>("input_519_cast_fp16")];
             tensor<string, []> var_7646_pad_type_0 = const()[name = tensor<string, []>("op_7646_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5423,14 +5423,14 @@ program(1.0)
             tensor<int32, [4]> var_7646_pad_0 = const()[name = tensor<string, []>("op_7646_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7646_dilations_0 = const()[name = tensor<string, []>("op_7646_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7646_groups_0 = const()[name = tensor<string, []>("op_7646_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_19_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(330457280))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(331505920))), name = tensor<string, []>("layers_19_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_19_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(454465280))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(456038208))), name = tensor<string, []>("layers_19_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_7646_cast_fp16 = conv(dilations = var_7646_dilations_0, groups = var_7646_groups_0, pad = var_7646_pad_0, pad_type = var_7646_pad_type_0, strides = var_7646_strides_0, weight = layers_19_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_519_cast_fp16)[name = tensor<string, []>("op_7646_cast_fp16")];
             tensor<string, []> var_7652_pad_type_0 = const()[name = tensor<string, []>("op_7652_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7652_strides_0 = const()[name = tensor<string, []>("op_7652_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7652_pad_0 = const()[name = tensor<string, []>("op_7652_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7652_dilations_0 = const()[name = tensor<string, []>("op_7652_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7652_groups_0 = const()[name = tensor<string, []>("op_7652_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_19_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(331572608))), name = tensor<string, []>("layers_19_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33223]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(331506048))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_19_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(456104960))), name = tensor<string, []>("layers_19_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33223]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(456038400))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_7652_cast_fp16 = conv(dilations = var_7652_dilations_0, groups = var_7652_groups_0, pad = var_7652_pad_0, pad_type = var_7652_pad_type_0, strides = var_7652_strides_0, weight = layers_19_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_519_cast_fp16)[name = tensor<string, []>("op_7652_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_521_cast_fp16 = add(x = var_7646_cast_fp16, y = var_7652_cast_fp16)[name = tensor<string, []>("input_521_cast_fp16")];
             tensor<int32, []> input_523_split_num_splits_0 = const()[name = tensor<string, []>("input_523_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -5443,8 +5443,8 @@ program(1.0)
             tensor<int32, []> input_525_groups_0 = const()[name = tensor<string, []>("input_525_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_525_strides_0 = const()[name = tensor<string, []>("input_525_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_525_dilations_0 = const()[name = tensor<string, []>("input_525_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_306_to_fp16 = const()[name = tensor<string, []>("const_306_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(331834816)))];
-            tensor<fp16, [1024]> const_307_to_fp16 = const()[name = tensor<string, []>("const_307_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(331853312)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_306_to_fp16 = const()[name = tensor<string, []>("const_306_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(456367168)))];
+            tensor<fp16, [1024]> const_307_to_fp16 = const()[name = tensor<string, []>("const_307_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(456385664)))];
             tensor<fp16, [1, 1024, 1, 188]> input_527_cast_fp16 = conv(bias = const_307_to_fp16, dilations = input_525_dilations_0, groups = input_525_groups_0, pad = input_525_pad_0, pad_type = input_525_pad_type_0, strides = input_525_strides_0, weight = const_306_to_fp16, x = input_523_cast_fp16)[name = tensor<string, []>("input_527_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_529_cast_fp16 = silu(x = input_527_cast_fp16)[name = tensor<string, []>("input_529_cast_fp16")];
             tensor<string, []> var_7674_pad_type_0 = const()[name = tensor<string, []>("op_7674_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5452,22 +5452,22 @@ program(1.0)
             tensor<int32, [4]> var_7674_pad_0 = const()[name = tensor<string, []>("op_7674_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7674_dilations_0 = const()[name = tensor<string, []>("op_7674_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7674_groups_0 = const()[name = tensor<string, []>("op_7674_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(331855424))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(332379776))), name = tensor<string, []>("layers_19_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(456387776))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(457174272))), name = tensor<string, []>("layers_19_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7674_cast_fp16 = conv(dilations = var_7674_dilations_0, groups = var_7674_groups_0, pad = var_7674_pad_0, pad_type = var_7674_pad_type_0, strides = var_7674_strides_0, weight = layers_19_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_529_cast_fp16)[name = tensor<string, []>("op_7674_cast_fp16")];
             tensor<string, []> var_7680_pad_type_0 = const()[name = tensor<string, []>("op_7680_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7680_strides_0 = const()[name = tensor<string, []>("op_7680_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7680_pad_0 = const()[name = tensor<string, []>("op_7680_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7680_dilations_0 = const()[name = tensor<string, []>("op_7680_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7680_groups_0 = const()[name = tensor<string, []>("op_7680_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_19_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(332410560))), name = tensor<string, []>("layers_19_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15294]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(332379904))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_19_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(457205120))), name = tensor<string, []>("layers_19_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15294]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(457174464))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7680_cast_fp16 = conv(dilations = var_7680_dilations_0, groups = var_7680_groups_0, pad = var_7680_pad_0, pad_type = var_7680_pad_type_0, strides = var_7680_strides_0, weight = layers_19_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_529_cast_fp16)[name = tensor<string, []>("op_7680_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_119_cast_fp16 = add(x = var_7674_cast_fp16, y = var_7680_cast_fp16)[name = tensor<string, []>("x_119_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_197_cast_fp16 = add(x = inputs_195_cast_fp16, y = x_119_cast_fp16)[name = tensor<string, []>("inputs_197_cast_fp16")];
             tensor<int32, [1]> out_197_axes_0 = const()[name = tensor<string, []>("out_197_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7691_to_fp16 = const()[name = tensor<string, []>("op_7691_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_197_cast_fp16 = layer_norm(axes = out_197_axes_0, epsilon = var_7691_to_fp16, x = inputs_197_cast_fp16)[name = tensor<string, []>("out_197_cast_fp16")];
-            tensor<fp16, [1024]> input_531_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_531_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(332541696)))];
-            tensor<fp16, [1024]> input_531_beta_0_to_fp16 = const()[name = tensor<string, []>("input_531_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(332543808)))];
+            tensor<fp16, [1024]> input_531_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_531_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(457336256)))];
+            tensor<fp16, [1024]> input_531_beta_0_to_fp16 = const()[name = tensor<string, []>("input_531_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(457338368)))];
             tensor<fp16, []> input_531_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_531_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_531_cast_fp16 = batch_norm(beta = input_531_beta_0_to_fp16, epsilon = input_531_epsilon_0_to_fp16, gamma = input_531_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_197_cast_fp16)[name = tensor<string, []>("input_531_cast_fp16")];
             tensor<string, []> var_7711_pad_type_0 = const()[name = tensor<string, []>("op_7711_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5475,14 +5475,14 @@ program(1.0)
             tensor<int32, [4]> var_7711_pad_0 = const()[name = tensor<string, []>("op_7711_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7711_dilations_0 = const()[name = tensor<string, []>("op_7711_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7711_groups_0 = const()[name = tensor<string, []>("op_7711_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_19_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(332545920))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(334643136))), name = tensor<string, []>("layers_19_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_19_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(457340480))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(460486272))), name = tensor<string, []>("layers_19_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7711_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_7711_dilations_0, groups = var_7711_groups_0, pad = var_7711_pad_0, pad_type = var_7711_pad_type_0, strides = var_7711_strides_0, weight = layers_19_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_531_cast_fp16)[name = tensor<string, []>("op_7711_cast_fp16")];
             tensor<string, []> var_7717_pad_type_0 = const()[name = tensor<string, []>("op_7717_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7717_strides_0 = const()[name = tensor<string, []>("op_7717_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7717_pad_0 = const()[name = tensor<string, []>("op_7717_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7717_dilations_0 = const()[name = tensor<string, []>("op_7717_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7717_groups_0 = const()[name = tensor<string, []>("op_7717_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_19_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(334770944))), name = tensor<string, []>("layers_19_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [63800]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(334643264))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_19_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(460614144))), name = tensor<string, []>("layers_19_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [63800]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(460486464))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7717_cast_fp16 = conv(dilations = var_7717_dilations_0, groups = var_7717_groups_0, pad = var_7717_pad_0, pad_type = var_7717_pad_type_0, strides = var_7717_strides_0, weight = layers_19_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_531_cast_fp16)[name = tensor<string, []>("op_7717_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_533_cast_fp16 = add(x = var_7711_cast_fp16, y = var_7717_cast_fp16)[name = tensor<string, []>("input_533_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_535_cast_fp16 = silu(x = input_533_cast_fp16)[name = tensor<string, []>("input_535_cast_fp16")];
@@ -5491,14 +5491,14 @@ program(1.0)
             tensor<int32, [4]> var_7728_pad_0 = const()[name = tensor<string, []>("op_7728_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7728_dilations_0 = const()[name = tensor<string, []>("op_7728_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7728_groups_0 = const()[name = tensor<string, []>("op_7728_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_19_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(335295296))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(337392512))), name = tensor<string, []>("layers_19_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_19_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(461138496))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(464284288))), name = tensor<string, []>("layers_19_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7728_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7728_dilations_0, groups = var_7728_groups_0, pad = var_7728_pad_0, pad_type = var_7728_pad_type_0, strides = var_7728_strides_0, weight = layers_19_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_535_cast_fp16)[name = tensor<string, []>("op_7728_cast_fp16")];
             tensor<string, []> var_7734_pad_type_0 = const()[name = tensor<string, []>("op_7734_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7734_strides_0 = const()[name = tensor<string, []>("op_7734_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7734_pad_0 = const()[name = tensor<string, []>("op_7734_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7734_dilations_0 = const()[name = tensor<string, []>("op_7734_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7734_groups_0 = const()[name = tensor<string, []>("op_7734_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_19_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(337531520))), name = tensor<string, []>("layers_19_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69399]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(337392640))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_19_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(464423360))), name = tensor<string, []>("layers_19_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69399]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(464284480))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7734_cast_fp16 = conv(dilations = var_7734_dilations_0, groups = var_7734_groups_0, pad = var_7734_pad_0, pad_type = var_7734_pad_type_0, strides = var_7734_strides_0, weight = layers_19_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_535_cast_fp16)[name = tensor<string, []>("op_7734_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_121_cast_fp16 = add(x = var_7728_cast_fp16, y = var_7734_cast_fp16)[name = tensor<string, []>("x_121_cast_fp16")];
             tensor<fp16, []> var_7736_to_fp16 = const()[name = tensor<string, []>("op_7736_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -5507,16 +5507,16 @@ program(1.0)
             tensor<int32, [1]> out_199_axes_0 = const()[name = tensor<string, []>("out_199_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7747_to_fp16 = const()[name = tensor<string, []>("op_7747_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_199_cast_fp16 = layer_norm(axes = out_199_axes_0, epsilon = var_7747_to_fp16, x = inputs_199_cast_fp16)[name = tensor<string, []>("out_199_cast_fp16")];
-            tensor<fp16, [1024]> inputs_201_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_201_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(338055872)))];
-            tensor<fp16, [1024]> inputs_201_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_201_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(338057984)))];
+            tensor<fp16, [1024]> inputs_201_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_201_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(464947712)))];
+            tensor<fp16, [1024]> inputs_201_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_201_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(464949824)))];
             tensor<fp16, []> inputs_201_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_201_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_201_cast_fp16 = batch_norm(beta = inputs_201_beta_0_to_fp16, epsilon = inputs_201_epsilon_0_to_fp16, gamma = inputs_201_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_199_cast_fp16)[name = tensor<string, []>("inputs_201_cast_fp16")];
             tensor<int32, []> var_7761 = const()[name = tensor<string, []>("op_7761"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_201_axes_0 = const()[name = tensor<string, []>("out_201_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7792_to_fp16 = const()[name = tensor<string, []>("op_7792_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_201_cast_fp16 = layer_norm(axes = out_201_axes_0, epsilon = var_7792_to_fp16, x = inputs_201_cast_fp16)[name = tensor<string, []>("out_201_cast_fp16")];
-            tensor<fp16, [1024]> input_537_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_537_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(338060096)))];
-            tensor<fp16, [1024]> input_537_beta_0_to_fp16 = const()[name = tensor<string, []>("input_537_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(338062208)))];
+            tensor<fp16, [1024]> input_537_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_537_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(464951936)))];
+            tensor<fp16, [1024]> input_537_beta_0_to_fp16 = const()[name = tensor<string, []>("input_537_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(464954048)))];
             tensor<fp16, []> input_537_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_537_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_537_cast_fp16 = batch_norm(beta = input_537_beta_0_to_fp16, epsilon = input_537_epsilon_0_to_fp16, gamma = input_537_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_201_cast_fp16)[name = tensor<string, []>("input_537_cast_fp16")];
             tensor<string, []> var_7812_pad_type_0 = const()[name = tensor<string, []>("op_7812_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5524,14 +5524,14 @@ program(1.0)
             tensor<int32, [4]> var_7812_pad_0 = const()[name = tensor<string, []>("op_7812_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7812_dilations_0 = const()[name = tensor<string, []>("op_7812_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7812_groups_0 = const()[name = tensor<string, []>("op_7812_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_20_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(338064320))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(340161536))), name = tensor<string, []>("layers_20_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_20_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(464956160))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(468101952))), name = tensor<string, []>("layers_20_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7812_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_7812_dilations_0, groups = var_7812_groups_0, pad = var_7812_pad_0, pad_type = var_7812_pad_type_0, strides = var_7812_strides_0, weight = layers_20_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_537_cast_fp16)[name = tensor<string, []>("op_7812_cast_fp16")];
             tensor<string, []> var_7818_pad_type_0 = const()[name = tensor<string, []>("op_7818_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7818_strides_0 = const()[name = tensor<string, []>("op_7818_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7818_pad_0 = const()[name = tensor<string, []>("op_7818_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7818_dilations_0 = const()[name = tensor<string, []>("op_7818_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7818_groups_0 = const()[name = tensor<string, []>("op_7818_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_20_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(340291584))), name = tensor<string, []>("layers_20_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [64915]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(340161664))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_20_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(468232064))), name = tensor<string, []>("layers_20_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [64915]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(468102144))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_7818_cast_fp16 = conv(dilations = var_7818_dilations_0, groups = var_7818_groups_0, pad = var_7818_pad_0, pad_type = var_7818_pad_type_0, strides = var_7818_strides_0, weight = layers_20_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_537_cast_fp16)[name = tensor<string, []>("op_7818_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_539_cast_fp16 = add(x = var_7812_cast_fp16, y = var_7818_cast_fp16)[name = tensor<string, []>("input_539_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_541_cast_fp16 = silu(x = input_539_cast_fp16)[name = tensor<string, []>("input_541_cast_fp16")];
@@ -5540,14 +5540,14 @@ program(1.0)
             tensor<int32, [4]> var_7829_pad_0 = const()[name = tensor<string, []>("op_7829_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7829_dilations_0 = const()[name = tensor<string, []>("op_7829_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7829_groups_0 = const()[name = tensor<string, []>("op_7829_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_20_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(340815936))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(342913152))), name = tensor<string, []>("layers_20_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_20_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(468756416))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(471902208))), name = tensor<string, []>("layers_20_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7829_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7829_dilations_0, groups = var_7829_groups_0, pad = var_7829_pad_0, pad_type = var_7829_pad_type_0, strides = var_7829_strides_0, weight = layers_20_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_541_cast_fp16)[name = tensor<string, []>("op_7829_cast_fp16")];
             tensor<string, []> var_7835_pad_type_0 = const()[name = tensor<string, []>("op_7835_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7835_strides_0 = const()[name = tensor<string, []>("op_7835_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7835_pad_0 = const()[name = tensor<string, []>("op_7835_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7835_dilations_0 = const()[name = tensor<string, []>("op_7835_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7835_groups_0 = const()[name = tensor<string, []>("op_7835_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_20_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(343060736))), name = tensor<string, []>("layers_20_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [73682]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(342913280))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_20_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(472049856))), name = tensor<string, []>("layers_20_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [73682]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(471902400))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7835_cast_fp16 = conv(dilations = var_7835_dilations_0, groups = var_7835_groups_0, pad = var_7835_pad_0, pad_type = var_7835_pad_type_0, strides = var_7835_strides_0, weight = layers_20_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_541_cast_fp16)[name = tensor<string, []>("op_7835_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_123_cast_fp16 = add(x = var_7829_cast_fp16, y = var_7835_cast_fp16)[name = tensor<string, []>("x_123_cast_fp16")];
             tensor<fp16, []> var_7837_to_fp16 = const()[name = tensor<string, []>("op_7837_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -5556,8 +5556,8 @@ program(1.0)
             tensor<int32, [1]> out_203_axes_0 = const()[name = tensor<string, []>("out_203_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_7848_to_fp16 = const()[name = tensor<string, []>("op_7848_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_203_cast_fp16 = layer_norm(axes = out_203_axes_0, epsilon = var_7848_to_fp16, x = inputs_203_cast_fp16)[name = tensor<string, []>("out_203_cast_fp16")];
-            tensor<fp16, [1024]> obj_83_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_83_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(343585088)))];
-            tensor<fp16, [1024]> obj_83_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_83_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(343587200)))];
+            tensor<fp16, [1024]> obj_83_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_83_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(472574208)))];
+            tensor<fp16, [1024]> obj_83_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_83_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(472576320)))];
             tensor<fp16, []> obj_83_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_83_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_83_cast_fp16 = batch_norm(beta = obj_83_beta_0_to_fp16, epsilon = obj_83_epsilon_0_to_fp16, gamma = obj_83_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_203_cast_fp16)[name = tensor<string, []>("obj_83_cast_fp16")];
             tensor<string, []> var_7873_pad_type_0 = const()[name = tensor<string, []>("op_7873_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5565,14 +5565,14 @@ program(1.0)
             tensor<int32, [4]> var_7873_pad_0 = const()[name = tensor<string, []>("op_7873_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7873_dilations_0 = const()[name = tensor<string, []>("op_7873_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7873_groups_0 = const()[name = tensor<string, []>("op_7873_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(343589312))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(344113664))), name = tensor<string, []>("layers_20_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(472578432))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(473364928))), name = tensor<string, []>("layers_20_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7873_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7873_dilations_0, groups = var_7873_groups_0, pad = var_7873_pad_0, pad_type = var_7873_pad_type_0, strides = var_7873_strides_0, weight = layers_20_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_83_cast_fp16)[name = tensor<string, []>("op_7873_cast_fp16")];
             tensor<string, []> var_7879_pad_type_0 = const()[name = tensor<string, []>("op_7879_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7879_strides_0 = const()[name = tensor<string, []>("op_7879_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7879_pad_0 = const()[name = tensor<string, []>("op_7879_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7879_dilations_0 = const()[name = tensor<string, []>("op_7879_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7879_groups_0 = const()[name = tensor<string, []>("op_7879_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(344144064))), name = tensor<string, []>("layers_20_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15097]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(344113792))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(473395392))), name = tensor<string, []>("layers_20_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15097]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(473365120))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7879_cast_fp16 = conv(dilations = var_7879_dilations_0, groups = var_7879_groups_0, pad = var_7879_pad_0, pad_type = var_7879_pad_type_0, strides = var_7879_strides_0, weight = layers_20_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_83_cast_fp16)[name = tensor<string, []>("op_7879_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_81_cast_fp16 = add(x = var_7873_cast_fp16, y = var_7879_cast_fp16)[name = tensor<string, []>("query_81_cast_fp16")];
             tensor<string, []> var_7888_pad_type_0 = const()[name = tensor<string, []>("op_7888_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5580,14 +5580,14 @@ program(1.0)
             tensor<int32, [4]> var_7888_pad_0 = const()[name = tensor<string, []>("op_7888_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7888_dilations_0 = const()[name = tensor<string, []>("op_7888_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7888_groups_0 = const()[name = tensor<string, []>("op_7888_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(344275200))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(344799552))), name = tensor<string, []>("layers_20_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(473526528))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(474313024))), name = tensor<string, []>("layers_20_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7888_cast_fp16 = conv(dilations = var_7888_dilations_0, groups = var_7888_groups_0, pad = var_7888_pad_0, pad_type = var_7888_pad_type_0, strides = var_7888_strides_0, weight = layers_20_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_83_cast_fp16)[name = tensor<string, []>("op_7888_cast_fp16")];
             tensor<string, []> var_7894_pad_type_0 = const()[name = tensor<string, []>("op_7894_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7894_strides_0 = const()[name = tensor<string, []>("op_7894_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7894_pad_0 = const()[name = tensor<string, []>("op_7894_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7894_dilations_0 = const()[name = tensor<string, []>("op_7894_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7894_groups_0 = const()[name = tensor<string, []>("op_7894_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(344831360))), name = tensor<string, []>("layers_20_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15787]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(344799680))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(474344896))), name = tensor<string, []>("layers_20_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15787]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(474313216))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7894_cast_fp16 = conv(dilations = var_7894_dilations_0, groups = var_7894_groups_0, pad = var_7894_pad_0, pad_type = var_7894_pad_type_0, strides = var_7894_strides_0, weight = layers_20_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_83_cast_fp16)[name = tensor<string, []>("op_7894_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_41_cast_fp16 = add(x = var_7888_cast_fp16, y = var_7894_cast_fp16)[name = tensor<string, []>("key_41_cast_fp16")];
             tensor<string, []> var_7904_pad_type_0 = const()[name = tensor<string, []>("op_7904_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5595,33 +5595,33 @@ program(1.0)
             tensor<int32, [4]> var_7904_pad_0 = const()[name = tensor<string, []>("op_7904_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7904_dilations_0 = const()[name = tensor<string, []>("op_7904_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7904_groups_0 = const()[name = tensor<string, []>("op_7904_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(344962496))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(345486848))), name = tensor<string, []>("layers_20_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(474476032))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(475262528))), name = tensor<string, []>("layers_20_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7904_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7904_dilations_0, groups = var_7904_groups_0, pad = var_7904_pad_0, pad_type = var_7904_pad_type_0, strides = var_7904_strides_0, weight = layers_20_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_83_cast_fp16)[name = tensor<string, []>("op_7904_cast_fp16")];
             tensor<string, []> var_7910_pad_type_0 = const()[name = tensor<string, []>("op_7910_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7910_strides_0 = const()[name = tensor<string, []>("op_7910_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7910_pad_0 = const()[name = tensor<string, []>("op_7910_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7910_dilations_0 = const()[name = tensor<string, []>("op_7910_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7910_groups_0 = const()[name = tensor<string, []>("op_7910_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(345515648))), name = tensor<string, []>("layers_20_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14287]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(345486976))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(475291392))), name = tensor<string, []>("layers_20_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14287]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(475262720))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7910_cast_fp16 = conv(dilations = var_7910_dilations_0, groups = var_7910_groups_0, pad = var_7910_pad_0, pad_type = var_7910_pad_type_0, strides = var_7910_strides_0, weight = layers_20_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_83_cast_fp16)[name = tensor<string, []>("op_7910_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_41_cast_fp16 = add(x = var_7904_cast_fp16, y = var_7910_cast_fp16)[name = tensor<string, []>("value_41_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_7913_to_fp16 = const()[name = tensor<string, []>("op_7913_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(345646784)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_7913_to_fp16 = const()[name = tensor<string, []>("op_7913_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(475422528)))];
             tensor<fp16, [1, 1024, 1, 188]> query_83_cast_fp16 = add(x = query_81_cast_fp16, y = var_7913_to_fp16)[name = tensor<string, []>("query_83_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_7916_to_fp16 = const()[name = tensor<string, []>("op_7916_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(345648896)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_7916_to_fp16 = const()[name = tensor<string, []>("op_7916_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(475424640)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_41_cast_fp16 = add(x = query_81_cast_fp16, y = var_7916_to_fp16)[name = tensor<string, []>("q_with_bias_v_41_cast_fp16")];
             tensor<string, []> var_7926_pad_type_0 = const()[name = tensor<string, []>("op_7926_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7926_strides_0 = const()[name = tensor<string, []>("op_7926_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7926_pad_0 = const()[name = tensor<string, []>("op_7926_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7926_dilations_0 = const()[name = tensor<string, []>("op_7926_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7926_groups_0 = const()[name = tensor<string, []>("op_7926_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(345651008))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(346175360))), name = tensor<string, []>("layers_20_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(475426752))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(476213248))), name = tensor<string, []>("layers_20_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_7926_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7926_dilations_0, groups = var_7926_groups_0, pad = var_7926_pad_0, pad_type = var_7926_pad_type_0, strides = var_7926_strides_0, weight = layers_20_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_7926_cast_fp16")];
             tensor<string, []> var_7932_pad_type_0 = const()[name = tensor<string, []>("op_7932_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7932_strides_0 = const()[name = tensor<string, []>("op_7932_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7932_pad_0 = const()[name = tensor<string, []>("op_7932_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7932_dilations_0 = const()[name = tensor<string, []>("op_7932_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7932_groups_0 = const()[name = tensor<string, []>("op_7932_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(346245824))), name = tensor<string, []>("layers_20_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [35133]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(346175488))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(476283776))), name = tensor<string, []>("layers_20_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [35133]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(476213440))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_7932_cast_fp16 = conv(dilations = var_7932_dilations_0, groups = var_7932_groups_0, pad = var_7932_pad_0, pad_type = var_7932_pad_type_0, strides = var_7932_strides_0, weight = layers_20_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_7932_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_41_cast_fp16 = add(x = var_7926_cast_fp16, y = var_7932_cast_fp16)[name = tensor<string, []>("p_41_cast_fp16")];
             tensor<int32, [4]> var_7936 = const()[name = tensor<string, []>("op_7936"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -5672,22 +5672,22 @@ program(1.0)
             tensor<int32, [4]> var_7989_pad_0 = const()[name = tensor<string, []>("op_7989_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7989_dilations_0 = const()[name = tensor<string, []>("op_7989_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7989_groups_0 = const()[name = tensor<string, []>("op_7989_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(346376960))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(346901312))), name = tensor<string, []>("layers_20_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(476414912))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(477201408))), name = tensor<string, []>("layers_20_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7989_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_7989_dilations_0, groups = var_7989_groups_0, pad = var_7989_pad_0, pad_type = var_7989_pad_type_0, strides = var_7989_strides_0, weight = layers_20_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_543_cast_fp16)[name = tensor<string, []>("op_7989_cast_fp16")];
             tensor<string, []> var_7995_pad_type_0 = const()[name = tensor<string, []>("op_7995_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_7995_strides_0 = const()[name = tensor<string, []>("op_7995_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_7995_pad_0 = const()[name = tensor<string, []>("op_7995_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_7995_dilations_0 = const()[name = tensor<string, []>("op_7995_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_7995_groups_0 = const()[name = tensor<string, []>("op_7995_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(346930880))), name = tensor<string, []>("layers_20_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14672]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(346901440))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(477231040))), name = tensor<string, []>("layers_20_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14672]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(477201600))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_7995_cast_fp16 = conv(dilations = var_7995_dilations_0, groups = var_7995_groups_0, pad = var_7995_pad_0, pad_type = var_7995_pad_type_0, strides = var_7995_strides_0, weight = layers_20_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_543_cast_fp16)[name = tensor<string, []>("op_7995_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_85_cast_fp16 = add(x = var_7989_cast_fp16, y = var_7995_cast_fp16)[name = tensor<string, []>("obj_85_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_205_cast_fp16 = add(x = inputs_203_cast_fp16, y = obj_85_cast_fp16)[name = tensor<string, []>("inputs_205_cast_fp16")];
             tensor<int32, [1]> out_205_axes_0 = const()[name = tensor<string, []>("out_205_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8006_to_fp16 = const()[name = tensor<string, []>("op_8006_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_205_cast_fp16 = layer_norm(axes = out_205_axes_0, epsilon = var_8006_to_fp16, x = inputs_205_cast_fp16)[name = tensor<string, []>("out_205_cast_fp16")];
-            tensor<fp16, [1024]> input_545_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_545_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(347062016)))];
-            tensor<fp16, [1024]> input_545_beta_0_to_fp16 = const()[name = tensor<string, []>("input_545_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(347064128)))];
+            tensor<fp16, [1024]> input_545_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_545_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(477362176)))];
+            tensor<fp16, [1024]> input_545_beta_0_to_fp16 = const()[name = tensor<string, []>("input_545_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(477364288)))];
             tensor<fp16, []> input_545_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_545_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_545_cast_fp16 = batch_norm(beta = input_545_beta_0_to_fp16, epsilon = input_545_epsilon_0_to_fp16, gamma = input_545_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_205_cast_fp16)[name = tensor<string, []>("input_545_cast_fp16")];
             tensor<string, []> var_8027_pad_type_0 = const()[name = tensor<string, []>("op_8027_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5695,14 +5695,14 @@ program(1.0)
             tensor<int32, [4]> var_8027_pad_0 = const()[name = tensor<string, []>("op_8027_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8027_dilations_0 = const()[name = tensor<string, []>("op_8027_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8027_groups_0 = const()[name = tensor<string, []>("op_8027_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_20_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(347066240))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(348114880))), name = tensor<string, []>("layers_20_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_20_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(477366400))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(478939328))), name = tensor<string, []>("layers_20_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_8027_cast_fp16 = conv(dilations = var_8027_dilations_0, groups = var_8027_groups_0, pad = var_8027_pad_0, pad_type = var_8027_pad_type_0, strides = var_8027_strides_0, weight = layers_20_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_545_cast_fp16)[name = tensor<string, []>("op_8027_cast_fp16")];
             tensor<string, []> var_8033_pad_type_0 = const()[name = tensor<string, []>("op_8033_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8033_strides_0 = const()[name = tensor<string, []>("op_8033_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8033_pad_0 = const()[name = tensor<string, []>("op_8033_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8033_dilations_0 = const()[name = tensor<string, []>("op_8033_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8033_groups_0 = const()[name = tensor<string, []>("op_8033_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_20_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(348180032))), name = tensor<string, []>("layers_20_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32457]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(348115008))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_20_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(479004544))), name = tensor<string, []>("layers_20_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [32457]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(478939520))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_8033_cast_fp16 = conv(dilations = var_8033_dilations_0, groups = var_8033_groups_0, pad = var_8033_pad_0, pad_type = var_8033_pad_type_0, strides = var_8033_strides_0, weight = layers_20_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_545_cast_fp16)[name = tensor<string, []>("op_8033_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_547_cast_fp16 = add(x = var_8027_cast_fp16, y = var_8033_cast_fp16)[name = tensor<string, []>("input_547_cast_fp16")];
             tensor<int32, []> input_549_split_num_splits_0 = const()[name = tensor<string, []>("input_549_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -5715,8 +5715,8 @@ program(1.0)
             tensor<int32, []> input_551_groups_0 = const()[name = tensor<string, []>("input_551_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_551_strides_0 = const()[name = tensor<string, []>("input_551_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_551_dilations_0 = const()[name = tensor<string, []>("input_551_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_308_to_fp16 = const()[name = tensor<string, []>("const_308_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(348442240)))];
-            tensor<fp16, [1024]> const_309_to_fp16 = const()[name = tensor<string, []>("const_309_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(348460736)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_308_to_fp16 = const()[name = tensor<string, []>("const_308_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(479266752)))];
+            tensor<fp16, [1024]> const_309_to_fp16 = const()[name = tensor<string, []>("const_309_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(479285248)))];
             tensor<fp16, [1, 1024, 1, 188]> input_553_cast_fp16 = conv(bias = const_309_to_fp16, dilations = input_551_dilations_0, groups = input_551_groups_0, pad = input_551_pad_0, pad_type = input_551_pad_type_0, strides = input_551_strides_0, weight = const_308_to_fp16, x = input_549_cast_fp16)[name = tensor<string, []>("input_553_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_555_cast_fp16 = silu(x = input_553_cast_fp16)[name = tensor<string, []>("input_555_cast_fp16")];
             tensor<string, []> var_8055_pad_type_0 = const()[name = tensor<string, []>("op_8055_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5724,22 +5724,22 @@ program(1.0)
             tensor<int32, [4]> var_8055_pad_0 = const()[name = tensor<string, []>("op_8055_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8055_dilations_0 = const()[name = tensor<string, []>("op_8055_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8055_groups_0 = const()[name = tensor<string, []>("op_8055_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(348462848))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(348987200))), name = tensor<string, []>("layers_20_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(479287360))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(480073856))), name = tensor<string, []>("layers_20_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8055_cast_fp16 = conv(dilations = var_8055_dilations_0, groups = var_8055_groups_0, pad = var_8055_pad_0, pad_type = var_8055_pad_type_0, strides = var_8055_strides_0, weight = layers_20_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_555_cast_fp16)[name = tensor<string, []>("op_8055_cast_fp16")];
             tensor<string, []> var_8061_pad_type_0 = const()[name = tensor<string, []>("op_8061_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8061_strides_0 = const()[name = tensor<string, []>("op_8061_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8061_pad_0 = const()[name = tensor<string, []>("op_8061_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8061_dilations_0 = const()[name = tensor<string, []>("op_8061_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8061_groups_0 = const()[name = tensor<string, []>("op_8061_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_20_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(349019264))), name = tensor<string, []>("layers_20_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15933]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(348987328))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_20_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(480105984))), name = tensor<string, []>("layers_20_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15933]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(480074048))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8061_cast_fp16 = conv(dilations = var_8061_dilations_0, groups = var_8061_groups_0, pad = var_8061_pad_0, pad_type = var_8061_pad_type_0, strides = var_8061_strides_0, weight = layers_20_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_555_cast_fp16)[name = tensor<string, []>("op_8061_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_125_cast_fp16 = add(x = var_8055_cast_fp16, y = var_8061_cast_fp16)[name = tensor<string, []>("x_125_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_207_cast_fp16 = add(x = inputs_205_cast_fp16, y = x_125_cast_fp16)[name = tensor<string, []>("inputs_207_cast_fp16")];
             tensor<int32, [1]> out_207_axes_0 = const()[name = tensor<string, []>("out_207_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8072_to_fp16 = const()[name = tensor<string, []>("op_8072_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_207_cast_fp16 = layer_norm(axes = out_207_axes_0, epsilon = var_8072_to_fp16, x = inputs_207_cast_fp16)[name = tensor<string, []>("out_207_cast_fp16")];
-            tensor<fp16, [1024]> input_557_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_557_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(349150400)))];
-            tensor<fp16, [1024]> input_557_beta_0_to_fp16 = const()[name = tensor<string, []>("input_557_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(349152512)))];
+            tensor<fp16, [1024]> input_557_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_557_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(480237120)))];
+            tensor<fp16, [1024]> input_557_beta_0_to_fp16 = const()[name = tensor<string, []>("input_557_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(480239232)))];
             tensor<fp16, []> input_557_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_557_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_557_cast_fp16 = batch_norm(beta = input_557_beta_0_to_fp16, epsilon = input_557_epsilon_0_to_fp16, gamma = input_557_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_207_cast_fp16)[name = tensor<string, []>("input_557_cast_fp16")];
             tensor<string, []> var_8092_pad_type_0 = const()[name = tensor<string, []>("op_8092_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5747,14 +5747,14 @@ program(1.0)
             tensor<int32, [4]> var_8092_pad_0 = const()[name = tensor<string, []>("op_8092_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8092_dilations_0 = const()[name = tensor<string, []>("op_8092_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8092_groups_0 = const()[name = tensor<string, []>("op_8092_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_20_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(349154624))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(351251840))), name = tensor<string, []>("layers_20_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_20_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(480241344))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(483387136))), name = tensor<string, []>("layers_20_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8092_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_8092_dilations_0, groups = var_8092_groups_0, pad = var_8092_pad_0, pad_type = var_8092_pad_type_0, strides = var_8092_strides_0, weight = layers_20_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_557_cast_fp16)[name = tensor<string, []>("op_8092_cast_fp16")];
             tensor<string, []> var_8098_pad_type_0 = const()[name = tensor<string, []>("op_8098_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8098_strides_0 = const()[name = tensor<string, []>("op_8098_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8098_pad_0 = const()[name = tensor<string, []>("op_8098_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8098_dilations_0 = const()[name = tensor<string, []>("op_8098_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8098_groups_0 = const()[name = tensor<string, []>("op_8098_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_20_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(351378048))), name = tensor<string, []>("layers_20_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [63002]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(351251968))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_20_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(483513408))), name = tensor<string, []>("layers_20_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [63002]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(483387328))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8098_cast_fp16 = conv(dilations = var_8098_dilations_0, groups = var_8098_groups_0, pad = var_8098_pad_0, pad_type = var_8098_pad_type_0, strides = var_8098_strides_0, weight = layers_20_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_557_cast_fp16)[name = tensor<string, []>("op_8098_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_559_cast_fp16 = add(x = var_8092_cast_fp16, y = var_8098_cast_fp16)[name = tensor<string, []>("input_559_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_561_cast_fp16 = silu(x = input_559_cast_fp16)[name = tensor<string, []>("input_561_cast_fp16")];
@@ -5763,14 +5763,14 @@ program(1.0)
             tensor<int32, [4]> var_8109_pad_0 = const()[name = tensor<string, []>("op_8109_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8109_dilations_0 = const()[name = tensor<string, []>("op_8109_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8109_groups_0 = const()[name = tensor<string, []>("op_8109_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_20_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(351902400))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(353999616))), name = tensor<string, []>("layers_20_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_20_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(484037760))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(487183552))), name = tensor<string, []>("layers_20_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8109_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8109_dilations_0, groups = var_8109_groups_0, pad = var_8109_pad_0, pad_type = var_8109_pad_type_0, strides = var_8109_strides_0, weight = layers_20_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_561_cast_fp16)[name = tensor<string, []>("op_8109_cast_fp16")];
             tensor<string, []> var_8115_pad_type_0 = const()[name = tensor<string, []>("op_8115_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8115_strides_0 = const()[name = tensor<string, []>("op_8115_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8115_pad_0 = const()[name = tensor<string, []>("op_8115_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8115_dilations_0 = const()[name = tensor<string, []>("op_8115_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8115_groups_0 = const()[name = tensor<string, []>("op_8115_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_20_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(354144768))), name = tensor<string, []>("layers_20_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72470]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(353999744))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_20_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(487328768))), name = tensor<string, []>("layers_20_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [72470]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(487183744))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8115_cast_fp16 = conv(dilations = var_8115_dilations_0, groups = var_8115_groups_0, pad = var_8115_pad_0, pad_type = var_8115_pad_type_0, strides = var_8115_strides_0, weight = layers_20_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_561_cast_fp16)[name = tensor<string, []>("op_8115_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_127_cast_fp16 = add(x = var_8109_cast_fp16, y = var_8115_cast_fp16)[name = tensor<string, []>("x_127_cast_fp16")];
             tensor<fp16, []> var_8117_to_fp16 = const()[name = tensor<string, []>("op_8117_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -5779,16 +5779,16 @@ program(1.0)
             tensor<int32, [1]> out_209_axes_0 = const()[name = tensor<string, []>("out_209_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8128_to_fp16 = const()[name = tensor<string, []>("op_8128_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_209_cast_fp16 = layer_norm(axes = out_209_axes_0, epsilon = var_8128_to_fp16, x = inputs_209_cast_fp16)[name = tensor<string, []>("out_209_cast_fp16")];
-            tensor<fp16, [1024]> inputs_211_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_211_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(354669120)))];
-            tensor<fp16, [1024]> inputs_211_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_211_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(354671232)))];
+            tensor<fp16, [1024]> inputs_211_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_211_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(487853120)))];
+            tensor<fp16, [1024]> inputs_211_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_211_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(487855232)))];
             tensor<fp16, []> inputs_211_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_211_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_211_cast_fp16 = batch_norm(beta = inputs_211_beta_0_to_fp16, epsilon = inputs_211_epsilon_0_to_fp16, gamma = inputs_211_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_209_cast_fp16)[name = tensor<string, []>("inputs_211_cast_fp16")];
             tensor<int32, []> var_8142 = const()[name = tensor<string, []>("op_8142"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_211_axes_0 = const()[name = tensor<string, []>("out_211_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8173_to_fp16 = const()[name = tensor<string, []>("op_8173_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_211_cast_fp16 = layer_norm(axes = out_211_axes_0, epsilon = var_8173_to_fp16, x = inputs_211_cast_fp16)[name = tensor<string, []>("out_211_cast_fp16")];
-            tensor<fp16, [1024]> input_563_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_563_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(354673344)))];
-            tensor<fp16, [1024]> input_563_beta_0_to_fp16 = const()[name = tensor<string, []>("input_563_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(354675456)))];
+            tensor<fp16, [1024]> input_563_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_563_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(487857344)))];
+            tensor<fp16, [1024]> input_563_beta_0_to_fp16 = const()[name = tensor<string, []>("input_563_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(487859456)))];
             tensor<fp16, []> input_563_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_563_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_563_cast_fp16 = batch_norm(beta = input_563_beta_0_to_fp16, epsilon = input_563_epsilon_0_to_fp16, gamma = input_563_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_211_cast_fp16)[name = tensor<string, []>("input_563_cast_fp16")];
             tensor<string, []> var_8193_pad_type_0 = const()[name = tensor<string, []>("op_8193_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5796,14 +5796,14 @@ program(1.0)
             tensor<int32, [4]> var_8193_pad_0 = const()[name = tensor<string, []>("op_8193_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8193_dilations_0 = const()[name = tensor<string, []>("op_8193_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8193_groups_0 = const()[name = tensor<string, []>("op_8193_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_21_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(354677568))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(356774784))), name = tensor<string, []>("layers_21_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_21_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(487861568))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(491007360))), name = tensor<string, []>("layers_21_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8193_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_8193_dilations_0, groups = var_8193_groups_0, pad = var_8193_pad_0, pad_type = var_8193_pad_type_0, strides = var_8193_strides_0, weight = layers_21_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_563_cast_fp16)[name = tensor<string, []>("op_8193_cast_fp16")];
             tensor<string, []> var_8199_pad_type_0 = const()[name = tensor<string, []>("op_8199_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8199_strides_0 = const()[name = tensor<string, []>("op_8199_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8199_pad_0 = const()[name = tensor<string, []>("op_8199_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8199_dilations_0 = const()[name = tensor<string, []>("op_8199_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8199_groups_0 = const()[name = tensor<string, []>("op_8199_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_21_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(356905408))), name = tensor<string, []>("layers_21_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [65193]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(356774912))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_21_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(491138048))), name = tensor<string, []>("layers_21_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [65193]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(491007552))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8199_cast_fp16 = conv(dilations = var_8199_dilations_0, groups = var_8199_groups_0, pad = var_8199_pad_0, pad_type = var_8199_pad_type_0, strides = var_8199_strides_0, weight = layers_21_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_563_cast_fp16)[name = tensor<string, []>("op_8199_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_565_cast_fp16 = add(x = var_8193_cast_fp16, y = var_8199_cast_fp16)[name = tensor<string, []>("input_565_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_567_cast_fp16 = silu(x = input_565_cast_fp16)[name = tensor<string, []>("input_567_cast_fp16")];
@@ -5812,14 +5812,14 @@ program(1.0)
             tensor<int32, [4]> var_8210_pad_0 = const()[name = tensor<string, []>("op_8210_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8210_dilations_0 = const()[name = tensor<string, []>("op_8210_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8210_groups_0 = const()[name = tensor<string, []>("op_8210_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_21_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(357429760))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(359526976))), name = tensor<string, []>("layers_21_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_21_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(491662400))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(494808192))), name = tensor<string, []>("layers_21_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8210_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8210_dilations_0, groups = var_8210_groups_0, pad = var_8210_pad_0, pad_type = var_8210_pad_type_0, strides = var_8210_strides_0, weight = layers_21_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_567_cast_fp16)[name = tensor<string, []>("op_8210_cast_fp16")];
             tensor<string, []> var_8216_pad_type_0 = const()[name = tensor<string, []>("op_8216_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8216_strides_0 = const()[name = tensor<string, []>("op_8216_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8216_pad_0 = const()[name = tensor<string, []>("op_8216_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8216_dilations_0 = const()[name = tensor<string, []>("op_8216_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8216_groups_0 = const()[name = tensor<string, []>("op_8216_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_21_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(359692288))), name = tensor<string, []>("layers_21_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82556]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(359527104))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_21_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(494973568))), name = tensor<string, []>("layers_21_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82556]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(494808384))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8216_cast_fp16 = conv(dilations = var_8216_dilations_0, groups = var_8216_groups_0, pad = var_8216_pad_0, pad_type = var_8216_pad_type_0, strides = var_8216_strides_0, weight = layers_21_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_567_cast_fp16)[name = tensor<string, []>("op_8216_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_129_cast_fp16 = add(x = var_8210_cast_fp16, y = var_8216_cast_fp16)[name = tensor<string, []>("x_129_cast_fp16")];
             tensor<fp16, []> var_8218_to_fp16 = const()[name = tensor<string, []>("op_8218_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -5828,8 +5828,8 @@ program(1.0)
             tensor<int32, [1]> out_213_axes_0 = const()[name = tensor<string, []>("out_213_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8229_to_fp16 = const()[name = tensor<string, []>("op_8229_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_213_cast_fp16 = layer_norm(axes = out_213_axes_0, epsilon = var_8229_to_fp16, x = inputs_213_cast_fp16)[name = tensor<string, []>("out_213_cast_fp16")];
-            tensor<fp16, [1024]> obj_87_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_87_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360216640)))];
-            tensor<fp16, [1024]> obj_87_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_87_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360218752)))];
+            tensor<fp16, [1024]> obj_87_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_87_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(495497920)))];
+            tensor<fp16, [1024]> obj_87_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_87_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(495500032)))];
             tensor<fp16, []> obj_87_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_87_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_87_cast_fp16 = batch_norm(beta = obj_87_beta_0_to_fp16, epsilon = obj_87_epsilon_0_to_fp16, gamma = obj_87_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_213_cast_fp16)[name = tensor<string, []>("obj_87_cast_fp16")];
             tensor<string, []> var_8254_pad_type_0 = const()[name = tensor<string, []>("op_8254_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5837,14 +5837,14 @@ program(1.0)
             tensor<int32, [4]> var_8254_pad_0 = const()[name = tensor<string, []>("op_8254_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8254_dilations_0 = const()[name = tensor<string, []>("op_8254_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8254_groups_0 = const()[name = tensor<string, []>("op_8254_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360220864))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360745216))), name = tensor<string, []>("layers_21_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(495502144))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(496288640))), name = tensor<string, []>("layers_21_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8254_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8254_dilations_0, groups = var_8254_groups_0, pad = var_8254_pad_0, pad_type = var_8254_pad_type_0, strides = var_8254_strides_0, weight = layers_21_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_87_cast_fp16)[name = tensor<string, []>("op_8254_cast_fp16")];
             tensor<string, []> var_8260_pad_type_0 = const()[name = tensor<string, []>("op_8260_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8260_strides_0 = const()[name = tensor<string, []>("op_8260_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8260_pad_0 = const()[name = tensor<string, []>("op_8260_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8260_dilations_0 = const()[name = tensor<string, []>("op_8260_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8260_groups_0 = const()[name = tensor<string, []>("op_8260_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360778432))), name = tensor<string, []>("layers_21_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16494]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360745344))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(496321920))), name = tensor<string, []>("layers_21_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16494]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(496288832))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8260_cast_fp16 = conv(dilations = var_8260_dilations_0, groups = var_8260_groups_0, pad = var_8260_pad_0, pad_type = var_8260_pad_type_0, strides = var_8260_strides_0, weight = layers_21_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_87_cast_fp16)[name = tensor<string, []>("op_8260_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_85_cast_fp16 = add(x = var_8254_cast_fp16, y = var_8260_cast_fp16)[name = tensor<string, []>("query_85_cast_fp16")];
             tensor<string, []> var_8269_pad_type_0 = const()[name = tensor<string, []>("op_8269_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5852,14 +5852,14 @@ program(1.0)
             tensor<int32, [4]> var_8269_pad_0 = const()[name = tensor<string, []>("op_8269_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8269_dilations_0 = const()[name = tensor<string, []>("op_8269_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8269_groups_0 = const()[name = tensor<string, []>("op_8269_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(360909568))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(361433920))), name = tensor<string, []>("layers_21_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(496453056))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(497239552))), name = tensor<string, []>("layers_21_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8269_cast_fp16 = conv(dilations = var_8269_dilations_0, groups = var_8269_groups_0, pad = var_8269_pad_0, pad_type = var_8269_pad_type_0, strides = var_8269_strides_0, weight = layers_21_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_87_cast_fp16)[name = tensor<string, []>("op_8269_cast_fp16")];
             tensor<string, []> var_8275_pad_type_0 = const()[name = tensor<string, []>("op_8275_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8275_strides_0 = const()[name = tensor<string, []>("op_8275_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8275_pad_0 = const()[name = tensor<string, []>("op_8275_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8275_dilations_0 = const()[name = tensor<string, []>("op_8275_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8275_groups_0 = const()[name = tensor<string, []>("op_8275_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(361466752))), name = tensor<string, []>("layers_21_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16301]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(361434048))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(497272448))), name = tensor<string, []>("layers_21_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16301]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(497239744))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8275_cast_fp16 = conv(dilations = var_8275_dilations_0, groups = var_8275_groups_0, pad = var_8275_pad_0, pad_type = var_8275_pad_type_0, strides = var_8275_strides_0, weight = layers_21_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_87_cast_fp16)[name = tensor<string, []>("op_8275_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_43_cast_fp16 = add(x = var_8269_cast_fp16, y = var_8275_cast_fp16)[name = tensor<string, []>("key_43_cast_fp16")];
             tensor<string, []> var_8285_pad_type_0 = const()[name = tensor<string, []>("op_8285_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5867,33 +5867,33 @@ program(1.0)
             tensor<int32, [4]> var_8285_pad_0 = const()[name = tensor<string, []>("op_8285_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8285_dilations_0 = const()[name = tensor<string, []>("op_8285_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8285_groups_0 = const()[name = tensor<string, []>("op_8285_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(361597888))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362122240))), name = tensor<string, []>("layers_21_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(497403584))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(498190080))), name = tensor<string, []>("layers_21_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8285_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8285_dilations_0, groups = var_8285_groups_0, pad = var_8285_pad_0, pad_type = var_8285_pad_type_0, strides = var_8285_strides_0, weight = layers_21_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_87_cast_fp16)[name = tensor<string, []>("op_8285_cast_fp16")];
             tensor<string, []> var_8291_pad_type_0 = const()[name = tensor<string, []>("op_8291_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8291_strides_0 = const()[name = tensor<string, []>("op_8291_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8291_pad_0 = const()[name = tensor<string, []>("op_8291_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8291_dilations_0 = const()[name = tensor<string, []>("op_8291_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8291_groups_0 = const()[name = tensor<string, []>("op_8291_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362151680))), name = tensor<string, []>("layers_21_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14603]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362122368))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(498219584))), name = tensor<string, []>("layers_21_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14603]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(498190272))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8291_cast_fp16 = conv(dilations = var_8291_dilations_0, groups = var_8291_groups_0, pad = var_8291_pad_0, pad_type = var_8291_pad_type_0, strides = var_8291_strides_0, weight = layers_21_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_87_cast_fp16)[name = tensor<string, []>("op_8291_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_43_cast_fp16 = add(x = var_8285_cast_fp16, y = var_8291_cast_fp16)[name = tensor<string, []>("value_43_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_8294_to_fp16 = const()[name = tensor<string, []>("op_8294_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362282816)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_8294_to_fp16 = const()[name = tensor<string, []>("op_8294_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(498350720)))];
             tensor<fp16, [1, 1024, 1, 188]> query_87_cast_fp16 = add(x = query_85_cast_fp16, y = var_8294_to_fp16)[name = tensor<string, []>("query_87_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_8297_to_fp16 = const()[name = tensor<string, []>("op_8297_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362284928)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_8297_to_fp16 = const()[name = tensor<string, []>("op_8297_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(498352832)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_43_cast_fp16 = add(x = query_85_cast_fp16, y = var_8297_to_fp16)[name = tensor<string, []>("q_with_bias_v_43_cast_fp16")];
             tensor<string, []> var_8307_pad_type_0 = const()[name = tensor<string, []>("op_8307_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8307_strides_0 = const()[name = tensor<string, []>("op_8307_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8307_pad_0 = const()[name = tensor<string, []>("op_8307_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8307_dilations_0 = const()[name = tensor<string, []>("op_8307_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8307_groups_0 = const()[name = tensor<string, []>("op_8307_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362287040))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362811392))), name = tensor<string, []>("layers_21_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(498354944))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(499141440))), name = tensor<string, []>("layers_21_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_8307_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8307_dilations_0, groups = var_8307_groups_0, pad = var_8307_pad_0, pad_type = var_8307_pad_type_0, strides = var_8307_strides_0, weight = layers_21_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_8307_cast_fp16")];
             tensor<string, []> var_8313_pad_type_0 = const()[name = tensor<string, []>("op_8313_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8313_strides_0 = const()[name = tensor<string, []>("op_8313_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8313_pad_0 = const()[name = tensor<string, []>("op_8313_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8313_dilations_0 = const()[name = tensor<string, []>("op_8313_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8313_groups_0 = const()[name = tensor<string, []>("op_8313_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362893184))), name = tensor<string, []>("layers_21_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [40787]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(362811520))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(499223296))), name = tensor<string, []>("layers_21_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [40787]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(499141632))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_8313_cast_fp16 = conv(dilations = var_8313_dilations_0, groups = var_8313_groups_0, pad = var_8313_pad_0, pad_type = var_8313_pad_type_0, strides = var_8313_strides_0, weight = layers_21_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_8313_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_43_cast_fp16 = add(x = var_8307_cast_fp16, y = var_8313_cast_fp16)[name = tensor<string, []>("p_43_cast_fp16")];
             tensor<int32, [4]> var_8317 = const()[name = tensor<string, []>("op_8317"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -5944,22 +5944,22 @@ program(1.0)
             tensor<int32, [4]> var_8370_pad_0 = const()[name = tensor<string, []>("op_8370_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8370_dilations_0 = const()[name = tensor<string, []>("op_8370_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8370_groups_0 = const()[name = tensor<string, []>("op_8370_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(363024320))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(363548672))), name = tensor<string, []>("layers_21_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(499354432))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(500140928))), name = tensor<string, []>("layers_21_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8370_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8370_dilations_0, groups = var_8370_groups_0, pad = var_8370_pad_0, pad_type = var_8370_pad_type_0, strides = var_8370_strides_0, weight = layers_21_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_569_cast_fp16)[name = tensor<string, []>("op_8370_cast_fp16")];
             tensor<string, []> var_8376_pad_type_0 = const()[name = tensor<string, []>("op_8376_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8376_strides_0 = const()[name = tensor<string, []>("op_8376_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8376_pad_0 = const()[name = tensor<string, []>("op_8376_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8376_dilations_0 = const()[name = tensor<string, []>("op_8376_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8376_groups_0 = const()[name = tensor<string, []>("op_8376_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(363578240))), name = tensor<string, []>("layers_21_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14686]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(363548800))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(500170560))), name = tensor<string, []>("layers_21_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14686]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(500141120))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8376_cast_fp16 = conv(dilations = var_8376_dilations_0, groups = var_8376_groups_0, pad = var_8376_pad_0, pad_type = var_8376_pad_type_0, strides = var_8376_strides_0, weight = layers_21_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_569_cast_fp16)[name = tensor<string, []>("op_8376_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_89_cast_fp16 = add(x = var_8370_cast_fp16, y = var_8376_cast_fp16)[name = tensor<string, []>("obj_89_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_215_cast_fp16 = add(x = inputs_213_cast_fp16, y = obj_89_cast_fp16)[name = tensor<string, []>("inputs_215_cast_fp16")];
             tensor<int32, [1]> out_215_axes_0 = const()[name = tensor<string, []>("out_215_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8387_to_fp16 = const()[name = tensor<string, []>("op_8387_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_215_cast_fp16 = layer_norm(axes = out_215_axes_0, epsilon = var_8387_to_fp16, x = inputs_215_cast_fp16)[name = tensor<string, []>("out_215_cast_fp16")];
-            tensor<fp16, [1024]> input_571_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_571_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(363709376)))];
-            tensor<fp16, [1024]> input_571_beta_0_to_fp16 = const()[name = tensor<string, []>("input_571_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(363711488)))];
+            tensor<fp16, [1024]> input_571_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_571_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(500301696)))];
+            tensor<fp16, [1024]> input_571_beta_0_to_fp16 = const()[name = tensor<string, []>("input_571_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(500303808)))];
             tensor<fp16, []> input_571_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_571_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_571_cast_fp16 = batch_norm(beta = input_571_beta_0_to_fp16, epsilon = input_571_epsilon_0_to_fp16, gamma = input_571_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_215_cast_fp16)[name = tensor<string, []>("input_571_cast_fp16")];
             tensor<string, []> var_8408_pad_type_0 = const()[name = tensor<string, []>("op_8408_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5967,14 +5967,14 @@ program(1.0)
             tensor<int32, [4]> var_8408_pad_0 = const()[name = tensor<string, []>("op_8408_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8408_dilations_0 = const()[name = tensor<string, []>("op_8408_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8408_groups_0 = const()[name = tensor<string, []>("op_8408_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_21_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(363713600))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(364762240))), name = tensor<string, []>("layers_21_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_21_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(500305920))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(501878848))), name = tensor<string, []>("layers_21_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_8408_cast_fp16 = conv(dilations = var_8408_dilations_0, groups = var_8408_groups_0, pad = var_8408_pad_0, pad_type = var_8408_pad_type_0, strides = var_8408_strides_0, weight = layers_21_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_571_cast_fp16)[name = tensor<string, []>("op_8408_cast_fp16")];
             tensor<string, []> var_8414_pad_type_0 = const()[name = tensor<string, []>("op_8414_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8414_strides_0 = const()[name = tensor<string, []>("op_8414_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8414_pad_0 = const()[name = tensor<string, []>("op_8414_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8414_dilations_0 = const()[name = tensor<string, []>("op_8414_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8414_groups_0 = const()[name = tensor<string, []>("op_8414_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_21_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(364828928))), name = tensor<string, []>("layers_21_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33223]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(364762368))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_21_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(501945600))), name = tensor<string, []>("layers_21_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [33223]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(501879040))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_8414_cast_fp16 = conv(dilations = var_8414_dilations_0, groups = var_8414_groups_0, pad = var_8414_pad_0, pad_type = var_8414_pad_type_0, strides = var_8414_strides_0, weight = layers_21_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_571_cast_fp16)[name = tensor<string, []>("op_8414_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_573_cast_fp16 = add(x = var_8408_cast_fp16, y = var_8414_cast_fp16)[name = tensor<string, []>("input_573_cast_fp16")];
             tensor<int32, []> input_575_split_num_splits_0 = const()[name = tensor<string, []>("input_575_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -5987,8 +5987,8 @@ program(1.0)
             tensor<int32, []> input_577_groups_0 = const()[name = tensor<string, []>("input_577_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_577_strides_0 = const()[name = tensor<string, []>("input_577_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_577_dilations_0 = const()[name = tensor<string, []>("input_577_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_310_to_fp16 = const()[name = tensor<string, []>("const_310_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365091136)))];
-            tensor<fp16, [1024]> const_311_to_fp16 = const()[name = tensor<string, []>("const_311_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365109632)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_310_to_fp16 = const()[name = tensor<string, []>("const_310_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(502207808)))];
+            tensor<fp16, [1024]> const_311_to_fp16 = const()[name = tensor<string, []>("const_311_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(502226304)))];
             tensor<fp16, [1, 1024, 1, 188]> input_579_cast_fp16 = conv(bias = const_311_to_fp16, dilations = input_577_dilations_0, groups = input_577_groups_0, pad = input_577_pad_0, pad_type = input_577_pad_type_0, strides = input_577_strides_0, weight = const_310_to_fp16, x = input_575_cast_fp16)[name = tensor<string, []>("input_579_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_581_cast_fp16 = silu(x = input_579_cast_fp16)[name = tensor<string, []>("input_581_cast_fp16")];
             tensor<string, []> var_8436_pad_type_0 = const()[name = tensor<string, []>("op_8436_pad_type_0"), val = tensor<string, []>("valid")];
@@ -5996,22 +5996,22 @@ program(1.0)
             tensor<int32, [4]> var_8436_pad_0 = const()[name = tensor<string, []>("op_8436_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8436_dilations_0 = const()[name = tensor<string, []>("op_8436_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8436_groups_0 = const()[name = tensor<string, []>("op_8436_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365111744))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365636096))), name = tensor<string, []>("layers_21_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(502228416))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(503014912))), name = tensor<string, []>("layers_21_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8436_cast_fp16 = conv(dilations = var_8436_dilations_0, groups = var_8436_groups_0, pad = var_8436_pad_0, pad_type = var_8436_pad_type_0, strides = var_8436_strides_0, weight = layers_21_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_581_cast_fp16)[name = tensor<string, []>("op_8436_cast_fp16")];
             tensor<string, []> var_8442_pad_type_0 = const()[name = tensor<string, []>("op_8442_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8442_strides_0 = const()[name = tensor<string, []>("op_8442_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8442_pad_0 = const()[name = tensor<string, []>("op_8442_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8442_dilations_0 = const()[name = tensor<string, []>("op_8442_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8442_groups_0 = const()[name = tensor<string, []>("op_8442_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_21_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365669248))), name = tensor<string, []>("layers_21_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16455]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365636224))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_21_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(503048128))), name = tensor<string, []>("layers_21_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16455]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(503015104))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8442_cast_fp16 = conv(dilations = var_8442_dilations_0, groups = var_8442_groups_0, pad = var_8442_pad_0, pad_type = var_8442_pad_type_0, strides = var_8442_strides_0, weight = layers_21_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_581_cast_fp16)[name = tensor<string, []>("op_8442_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_131_cast_fp16 = add(x = var_8436_cast_fp16, y = var_8442_cast_fp16)[name = tensor<string, []>("x_131_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_217_cast_fp16 = add(x = inputs_215_cast_fp16, y = x_131_cast_fp16)[name = tensor<string, []>("inputs_217_cast_fp16")];
             tensor<int32, [1]> out_217_axes_0 = const()[name = tensor<string, []>("out_217_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8453_to_fp16 = const()[name = tensor<string, []>("op_8453_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_217_cast_fp16 = layer_norm(axes = out_217_axes_0, epsilon = var_8453_to_fp16, x = inputs_217_cast_fp16)[name = tensor<string, []>("out_217_cast_fp16")];
-            tensor<fp16, [1024]> input_583_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_583_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365800384)))];
-            tensor<fp16, [1024]> input_583_beta_0_to_fp16 = const()[name = tensor<string, []>("input_583_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365802496)))];
+            tensor<fp16, [1024]> input_583_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_583_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(503179264)))];
+            tensor<fp16, [1024]> input_583_beta_0_to_fp16 = const()[name = tensor<string, []>("input_583_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(503181376)))];
             tensor<fp16, []> input_583_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_583_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_583_cast_fp16 = batch_norm(beta = input_583_beta_0_to_fp16, epsilon = input_583_epsilon_0_to_fp16, gamma = input_583_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_217_cast_fp16)[name = tensor<string, []>("input_583_cast_fp16")];
             tensor<string, []> var_8473_pad_type_0 = const()[name = tensor<string, []>("op_8473_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6019,14 +6019,14 @@ program(1.0)
             tensor<int32, [4]> var_8473_pad_0 = const()[name = tensor<string, []>("op_8473_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8473_dilations_0 = const()[name = tensor<string, []>("op_8473_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8473_groups_0 = const()[name = tensor<string, []>("op_8473_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_21_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(365804608))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(367901824))), name = tensor<string, []>("layers_21_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_21_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(503183488))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(506329280))), name = tensor<string, []>("layers_21_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8473_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_8473_dilations_0, groups = var_8473_groups_0, pad = var_8473_pad_0, pad_type = var_8473_pad_type_0, strides = var_8473_strides_0, weight = layers_21_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_583_cast_fp16)[name = tensor<string, []>("op_8473_cast_fp16")];
             tensor<string, []> var_8479_pad_type_0 = const()[name = tensor<string, []>("op_8479_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8479_strides_0 = const()[name = tensor<string, []>("op_8479_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8479_pad_0 = const()[name = tensor<string, []>("op_8479_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8479_dilations_0 = const()[name = tensor<string, []>("op_8479_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8479_groups_0 = const()[name = tensor<string, []>("op_8479_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_21_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(368032768))), name = tensor<string, []>("layers_21_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [65370]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(367901952))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_21_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(506460288))), name = tensor<string, []>("layers_21_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [65370]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(506329472))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8479_cast_fp16 = conv(dilations = var_8479_dilations_0, groups = var_8479_groups_0, pad = var_8479_pad_0, pad_type = var_8479_pad_type_0, strides = var_8479_strides_0, weight = layers_21_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_583_cast_fp16)[name = tensor<string, []>("op_8479_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_585_cast_fp16 = add(x = var_8473_cast_fp16, y = var_8479_cast_fp16)[name = tensor<string, []>("input_585_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_587_cast_fp16 = silu(x = input_585_cast_fp16)[name = tensor<string, []>("input_587_cast_fp16")];
@@ -6035,14 +6035,14 @@ program(1.0)
             tensor<int32, [4]> var_8490_pad_0 = const()[name = tensor<string, []>("op_8490_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8490_dilations_0 = const()[name = tensor<string, []>("op_8490_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8490_groups_0 = const()[name = tensor<string, []>("op_8490_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_21_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(368557120))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(370654336))), name = tensor<string, []>("layers_21_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_21_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(506984640))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(510130432))), name = tensor<string, []>("layers_21_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8490_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8490_dilations_0, groups = var_8490_groups_0, pad = var_8490_pad_0, pad_type = var_8490_pad_type_0, strides = var_8490_strides_0, weight = layers_21_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_587_cast_fp16)[name = tensor<string, []>("op_8490_cast_fp16")];
             tensor<string, []> var_8496_pad_type_0 = const()[name = tensor<string, []>("op_8496_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8496_strides_0 = const()[name = tensor<string, []>("op_8496_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8496_pad_0 = const()[name = tensor<string, []>("op_8496_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8496_dilations_0 = const()[name = tensor<string, []>("op_8496_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8496_groups_0 = const()[name = tensor<string, []>("op_8496_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_21_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(370812288))), name = tensor<string, []>("layers_21_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [78872]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(370654464))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_21_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(510288448))), name = tensor<string, []>("layers_21_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [78872]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(510130624))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8496_cast_fp16 = conv(dilations = var_8496_dilations_0, groups = var_8496_groups_0, pad = var_8496_pad_0, pad_type = var_8496_pad_type_0, strides = var_8496_strides_0, weight = layers_21_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_587_cast_fp16)[name = tensor<string, []>("op_8496_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_133_cast_fp16 = add(x = var_8490_cast_fp16, y = var_8496_cast_fp16)[name = tensor<string, []>("x_133_cast_fp16")];
             tensor<fp16, []> var_8498_to_fp16 = const()[name = tensor<string, []>("op_8498_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -6051,16 +6051,16 @@ program(1.0)
             tensor<int32, [1]> out_219_axes_0 = const()[name = tensor<string, []>("out_219_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8509_to_fp16 = const()[name = tensor<string, []>("op_8509_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_219_cast_fp16 = layer_norm(axes = out_219_axes_0, epsilon = var_8509_to_fp16, x = inputs_219_cast_fp16)[name = tensor<string, []>("out_219_cast_fp16")];
-            tensor<fp16, [1024]> inputs_221_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_221_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(371336640)))];
-            tensor<fp16, [1024]> inputs_221_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_221_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(371338752)))];
+            tensor<fp16, [1024]> inputs_221_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_221_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(510812800)))];
+            tensor<fp16, [1024]> inputs_221_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_221_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(510814912)))];
             tensor<fp16, []> inputs_221_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_221_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_221_cast_fp16 = batch_norm(beta = inputs_221_beta_0_to_fp16, epsilon = inputs_221_epsilon_0_to_fp16, gamma = inputs_221_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_219_cast_fp16)[name = tensor<string, []>("inputs_221_cast_fp16")];
             tensor<int32, []> var_8523 = const()[name = tensor<string, []>("op_8523"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_221_axes_0 = const()[name = tensor<string, []>("out_221_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8554_to_fp16 = const()[name = tensor<string, []>("op_8554_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_221_cast_fp16 = layer_norm(axes = out_221_axes_0, epsilon = var_8554_to_fp16, x = inputs_221_cast_fp16)[name = tensor<string, []>("out_221_cast_fp16")];
-            tensor<fp16, [1024]> input_589_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_589_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(371340864)))];
-            tensor<fp16, [1024]> input_589_beta_0_to_fp16 = const()[name = tensor<string, []>("input_589_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(371342976)))];
+            tensor<fp16, [1024]> input_589_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_589_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(510817024)))];
+            tensor<fp16, [1024]> input_589_beta_0_to_fp16 = const()[name = tensor<string, []>("input_589_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(510819136)))];
             tensor<fp16, []> input_589_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_589_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_589_cast_fp16 = batch_norm(beta = input_589_beta_0_to_fp16, epsilon = input_589_epsilon_0_to_fp16, gamma = input_589_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_221_cast_fp16)[name = tensor<string, []>("input_589_cast_fp16")];
             tensor<string, []> var_8574_pad_type_0 = const()[name = tensor<string, []>("op_8574_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6068,14 +6068,14 @@ program(1.0)
             tensor<int32, [4]> var_8574_pad_0 = const()[name = tensor<string, []>("op_8574_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8574_dilations_0 = const()[name = tensor<string, []>("op_8574_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8574_groups_0 = const()[name = tensor<string, []>("op_8574_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_22_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(371345088))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(373442304))), name = tensor<string, []>("layers_22_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_22_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(510821248))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(513967040))), name = tensor<string, []>("layers_22_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8574_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_8574_dilations_0, groups = var_8574_groups_0, pad = var_8574_pad_0, pad_type = var_8574_pad_type_0, strides = var_8574_strides_0, weight = layers_22_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_589_cast_fp16)[name = tensor<string, []>("op_8574_cast_fp16")];
             tensor<string, []> var_8580_pad_type_0 = const()[name = tensor<string, []>("op_8580_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8580_strides_0 = const()[name = tensor<string, []>("op_8580_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8580_pad_0 = const()[name = tensor<string, []>("op_8580_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8580_dilations_0 = const()[name = tensor<string, []>("op_8580_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8580_groups_0 = const()[name = tensor<string, []>("op_8580_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_22_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(373571136))), name = tensor<string, []>("layers_22_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [64320]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(373442432))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_22_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(514095936))), name = tensor<string, []>("layers_22_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [64320]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(513967232))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8580_cast_fp16 = conv(dilations = var_8580_dilations_0, groups = var_8580_groups_0, pad = var_8580_pad_0, pad_type = var_8580_pad_type_0, strides = var_8580_strides_0, weight = layers_22_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_589_cast_fp16)[name = tensor<string, []>("op_8580_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_591_cast_fp16 = add(x = var_8574_cast_fp16, y = var_8580_cast_fp16)[name = tensor<string, []>("input_591_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_593_cast_fp16 = silu(x = input_591_cast_fp16)[name = tensor<string, []>("input_593_cast_fp16")];
@@ -6084,14 +6084,14 @@ program(1.0)
             tensor<int32, [4]> var_8591_pad_0 = const()[name = tensor<string, []>("op_8591_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8591_dilations_0 = const()[name = tensor<string, []>("op_8591_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8591_groups_0 = const()[name = tensor<string, []>("op_8591_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_22_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(374095488))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(376192704))), name = tensor<string, []>("layers_22_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_22_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(514620288))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(517766080))), name = tensor<string, []>("layers_22_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8591_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8591_dilations_0, groups = var_8591_groups_0, pad = var_8591_pad_0, pad_type = var_8591_pad_type_0, strides = var_8591_strides_0, weight = layers_22_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_593_cast_fp16)[name = tensor<string, []>("op_8591_cast_fp16")];
             tensor<string, []> var_8597_pad_type_0 = const()[name = tensor<string, []>("op_8597_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8597_strides_0 = const()[name = tensor<string, []>("op_8597_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8597_pad_0 = const()[name = tensor<string, []>("op_8597_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8597_dilations_0 = const()[name = tensor<string, []>("op_8597_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8597_groups_0 = const()[name = tensor<string, []>("op_8597_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_22_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(376358208))), name = tensor<string, []>("layers_22_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82630]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(376192832))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_22_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(517931648))), name = tensor<string, []>("layers_22_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [82630]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(517766272))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8597_cast_fp16 = conv(dilations = var_8597_dilations_0, groups = var_8597_groups_0, pad = var_8597_pad_0, pad_type = var_8597_pad_type_0, strides = var_8597_strides_0, weight = layers_22_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_593_cast_fp16)[name = tensor<string, []>("op_8597_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_135_cast_fp16 = add(x = var_8591_cast_fp16, y = var_8597_cast_fp16)[name = tensor<string, []>("x_135_cast_fp16")];
             tensor<fp16, []> var_8599_to_fp16 = const()[name = tensor<string, []>("op_8599_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -6100,8 +6100,8 @@ program(1.0)
             tensor<int32, [1]> out_223_axes_0 = const()[name = tensor<string, []>("out_223_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8610_to_fp16 = const()[name = tensor<string, []>("op_8610_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_223_cast_fp16 = layer_norm(axes = out_223_axes_0, epsilon = var_8610_to_fp16, x = inputs_223_cast_fp16)[name = tensor<string, []>("out_223_cast_fp16")];
-            tensor<fp16, [1024]> obj_91_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_91_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(376882560)))];
-            tensor<fp16, [1024]> obj_91_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_91_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(376884672)))];
+            tensor<fp16, [1024]> obj_91_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_91_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(518456000)))];
+            tensor<fp16, [1024]> obj_91_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_91_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(518458112)))];
             tensor<fp16, []> obj_91_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_91_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_91_cast_fp16 = batch_norm(beta = obj_91_beta_0_to_fp16, epsilon = obj_91_epsilon_0_to_fp16, gamma = obj_91_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_223_cast_fp16)[name = tensor<string, []>("obj_91_cast_fp16")];
             tensor<string, []> var_8635_pad_type_0 = const()[name = tensor<string, []>("op_8635_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6109,14 +6109,14 @@ program(1.0)
             tensor<int32, [4]> var_8635_pad_0 = const()[name = tensor<string, []>("op_8635_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8635_dilations_0 = const()[name = tensor<string, []>("op_8635_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8635_groups_0 = const()[name = tensor<string, []>("op_8635_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(376886784))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(377411136))), name = tensor<string, []>("layers_22_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(518460224))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(519246720))), name = tensor<string, []>("layers_22_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8635_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8635_dilations_0, groups = var_8635_groups_0, pad = var_8635_pad_0, pad_type = var_8635_pad_type_0, strides = var_8635_strides_0, weight = layers_22_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_91_cast_fp16)[name = tensor<string, []>("op_8635_cast_fp16")];
             tensor<string, []> var_8641_pad_type_0 = const()[name = tensor<string, []>("op_8641_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8641_strides_0 = const()[name = tensor<string, []>("op_8641_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8641_pad_0 = const()[name = tensor<string, []>("op_8641_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8641_dilations_0 = const()[name = tensor<string, []>("op_8641_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8641_groups_0 = const()[name = tensor<string, []>("op_8641_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(377442240))), name = tensor<string, []>("layers_22_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15448]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(377411264))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(519277888))), name = tensor<string, []>("layers_22_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15448]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(519246912))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8641_cast_fp16 = conv(dilations = var_8641_dilations_0, groups = var_8641_groups_0, pad = var_8641_pad_0, pad_type = var_8641_pad_type_0, strides = var_8641_strides_0, weight = layers_22_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_91_cast_fp16)[name = tensor<string, []>("op_8641_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_89_cast_fp16 = add(x = var_8635_cast_fp16, y = var_8641_cast_fp16)[name = tensor<string, []>("query_89_cast_fp16")];
             tensor<string, []> var_8650_pad_type_0 = const()[name = tensor<string, []>("op_8650_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6124,14 +6124,14 @@ program(1.0)
             tensor<int32, [4]> var_8650_pad_0 = const()[name = tensor<string, []>("op_8650_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8650_dilations_0 = const()[name = tensor<string, []>("op_8650_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8650_groups_0 = const()[name = tensor<string, []>("op_8650_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(377573376))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378097728))), name = tensor<string, []>("layers_22_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(519409024))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(520195520))), name = tensor<string, []>("layers_22_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8650_cast_fp16 = conv(dilations = var_8650_dilations_0, groups = var_8650_groups_0, pad = var_8650_pad_0, pad_type = var_8650_pad_type_0, strides = var_8650_strides_0, weight = layers_22_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_91_cast_fp16)[name = tensor<string, []>("op_8650_cast_fp16")];
             tensor<string, []> var_8656_pad_type_0 = const()[name = tensor<string, []>("op_8656_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8656_strides_0 = const()[name = tensor<string, []>("op_8656_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8656_pad_0 = const()[name = tensor<string, []>("op_8656_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8656_dilations_0 = const()[name = tensor<string, []>("op_8656_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8656_groups_0 = const()[name = tensor<string, []>("op_8656_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378129408))), name = tensor<string, []>("layers_22_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15744]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378097856))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(520227264))), name = tensor<string, []>("layers_22_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15744]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(520195712))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8656_cast_fp16 = conv(dilations = var_8656_dilations_0, groups = var_8656_groups_0, pad = var_8656_pad_0, pad_type = var_8656_pad_type_0, strides = var_8656_strides_0, weight = layers_22_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_91_cast_fp16)[name = tensor<string, []>("op_8656_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_45_cast_fp16 = add(x = var_8650_cast_fp16, y = var_8656_cast_fp16)[name = tensor<string, []>("key_45_cast_fp16")];
             tensor<string, []> var_8666_pad_type_0 = const()[name = tensor<string, []>("op_8666_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6139,33 +6139,33 @@ program(1.0)
             tensor<int32, [4]> var_8666_pad_0 = const()[name = tensor<string, []>("op_8666_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8666_dilations_0 = const()[name = tensor<string, []>("op_8666_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8666_groups_0 = const()[name = tensor<string, []>("op_8666_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378260544))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378784896))), name = tensor<string, []>("layers_22_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(520358400))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(521144896))), name = tensor<string, []>("layers_22_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8666_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8666_dilations_0, groups = var_8666_groups_0, pad = var_8666_pad_0, pad_type = var_8666_pad_type_0, strides = var_8666_strides_0, weight = layers_22_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_91_cast_fp16)[name = tensor<string, []>("op_8666_cast_fp16")];
             tensor<string, []> var_8672_pad_type_0 = const()[name = tensor<string, []>("op_8672_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8672_strides_0 = const()[name = tensor<string, []>("op_8672_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8672_pad_0 = const()[name = tensor<string, []>("op_8672_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8672_dilations_0 = const()[name = tensor<string, []>("op_8672_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8672_groups_0 = const()[name = tensor<string, []>("op_8672_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378816064))), name = tensor<string, []>("layers_22_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15472]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378785024))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(521176128))), name = tensor<string, []>("layers_22_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15472]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(521145088))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8672_cast_fp16 = conv(dilations = var_8672_dilations_0, groups = var_8672_groups_0, pad = var_8672_pad_0, pad_type = var_8672_pad_type_0, strides = var_8672_strides_0, weight = layers_22_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_91_cast_fp16)[name = tensor<string, []>("op_8672_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_45_cast_fp16 = add(x = var_8666_cast_fp16, y = var_8672_cast_fp16)[name = tensor<string, []>("value_45_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_8675_to_fp16 = const()[name = tensor<string, []>("op_8675_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378947200)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_8675_to_fp16 = const()[name = tensor<string, []>("op_8675_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(521307264)))];
             tensor<fp16, [1, 1024, 1, 188]> query_91_cast_fp16 = add(x = query_89_cast_fp16, y = var_8675_to_fp16)[name = tensor<string, []>("query_91_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_8678_to_fp16 = const()[name = tensor<string, []>("op_8678_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378949312)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_8678_to_fp16 = const()[name = tensor<string, []>("op_8678_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(521309376)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_45_cast_fp16 = add(x = query_89_cast_fp16, y = var_8678_to_fp16)[name = tensor<string, []>("q_with_bias_v_45_cast_fp16")];
             tensor<string, []> var_8688_pad_type_0 = const()[name = tensor<string, []>("op_8688_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8688_strides_0 = const()[name = tensor<string, []>("op_8688_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8688_pad_0 = const()[name = tensor<string, []>("op_8688_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8688_dilations_0 = const()[name = tensor<string, []>("op_8688_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8688_groups_0 = const()[name = tensor<string, []>("op_8688_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(378951424))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(379475776))), name = tensor<string, []>("layers_22_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(521311488))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(522097984))), name = tensor<string, []>("layers_22_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_8688_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8688_dilations_0, groups = var_8688_groups_0, pad = var_8688_pad_0, pad_type = var_8688_pad_type_0, strides = var_8688_strides_0, weight = layers_22_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_8688_cast_fp16")];
             tensor<string, []> var_8694_pad_type_0 = const()[name = tensor<string, []>("op_8694_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8694_strides_0 = const()[name = tensor<string, []>("op_8694_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8694_pad_0 = const()[name = tensor<string, []>("op_8694_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8694_dilations_0 = const()[name = tensor<string, []>("op_8694_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8694_groups_0 = const()[name = tensor<string, []>("op_8694_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(379553152))), name = tensor<string, []>("layers_22_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [38586]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(379475904))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(522175424))), name = tensor<string, []>("layers_22_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [38586]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(522098176))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_8694_cast_fp16 = conv(dilations = var_8694_dilations_0, groups = var_8694_groups_0, pad = var_8694_pad_0, pad_type = var_8694_pad_type_0, strides = var_8694_strides_0, weight = layers_22_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_8694_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_45_cast_fp16 = add(x = var_8688_cast_fp16, y = var_8694_cast_fp16)[name = tensor<string, []>("p_45_cast_fp16")];
             tensor<int32, [4]> var_8698 = const()[name = tensor<string, []>("op_8698"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -6216,22 +6216,22 @@ program(1.0)
             tensor<int32, [4]> var_8751_pad_0 = const()[name = tensor<string, []>("op_8751_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8751_dilations_0 = const()[name = tensor<string, []>("op_8751_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8751_groups_0 = const()[name = tensor<string, []>("op_8751_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(379684288))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380208640))), name = tensor<string, []>("layers_22_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(522306560))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(523093056))), name = tensor<string, []>("layers_22_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8751_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8751_dilations_0, groups = var_8751_groups_0, pad = var_8751_pad_0, pad_type = var_8751_pad_type_0, strides = var_8751_strides_0, weight = layers_22_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_595_cast_fp16)[name = tensor<string, []>("op_8751_cast_fp16")];
             tensor<string, []> var_8757_pad_type_0 = const()[name = tensor<string, []>("op_8757_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8757_strides_0 = const()[name = tensor<string, []>("op_8757_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8757_pad_0 = const()[name = tensor<string, []>("op_8757_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8757_dilations_0 = const()[name = tensor<string, []>("op_8757_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8757_groups_0 = const()[name = tensor<string, []>("op_8757_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380239424))), name = tensor<string, []>("layers_22_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15282]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380208768))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(523123904))), name = tensor<string, []>("layers_22_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [15282]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(523093248))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8757_cast_fp16 = conv(dilations = var_8757_dilations_0, groups = var_8757_groups_0, pad = var_8757_pad_0, pad_type = var_8757_pad_type_0, strides = var_8757_strides_0, weight = layers_22_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_595_cast_fp16)[name = tensor<string, []>("op_8757_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_93_cast_fp16 = add(x = var_8751_cast_fp16, y = var_8757_cast_fp16)[name = tensor<string, []>("obj_93_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_225_cast_fp16 = add(x = inputs_223_cast_fp16, y = obj_93_cast_fp16)[name = tensor<string, []>("inputs_225_cast_fp16")];
             tensor<int32, [1]> out_225_axes_0 = const()[name = tensor<string, []>("out_225_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8768_to_fp16 = const()[name = tensor<string, []>("op_8768_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_225_cast_fp16 = layer_norm(axes = out_225_axes_0, epsilon = var_8768_to_fp16, x = inputs_225_cast_fp16)[name = tensor<string, []>("out_225_cast_fp16")];
-            tensor<fp16, [1024]> input_597_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_597_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380370560)))];
-            tensor<fp16, [1024]> input_597_beta_0_to_fp16 = const()[name = tensor<string, []>("input_597_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380372672)))];
+            tensor<fp16, [1024]> input_597_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_597_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(523255040)))];
+            tensor<fp16, [1024]> input_597_beta_0_to_fp16 = const()[name = tensor<string, []>("input_597_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(523257152)))];
             tensor<fp16, []> input_597_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_597_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_597_cast_fp16 = batch_norm(beta = input_597_beta_0_to_fp16, epsilon = input_597_epsilon_0_to_fp16, gamma = input_597_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_225_cast_fp16)[name = tensor<string, []>("input_597_cast_fp16")];
             tensor<string, []> var_8789_pad_type_0 = const()[name = tensor<string, []>("op_8789_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6239,14 +6239,14 @@ program(1.0)
             tensor<int32, [4]> var_8789_pad_0 = const()[name = tensor<string, []>("op_8789_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8789_dilations_0 = const()[name = tensor<string, []>("op_8789_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8789_groups_0 = const()[name = tensor<string, []>("op_8789_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_22_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(380374784))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381423424))), name = tensor<string, []>("layers_22_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_22_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(523259264))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(524832192))), name = tensor<string, []>("layers_22_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_8789_cast_fp16 = conv(dilations = var_8789_dilations_0, groups = var_8789_groups_0, pad = var_8789_pad_0, pad_type = var_8789_pad_type_0, strides = var_8789_strides_0, weight = layers_22_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_597_cast_fp16)[name = tensor<string, []>("op_8789_cast_fp16")];
             tensor<string, []> var_8795_pad_type_0 = const()[name = tensor<string, []>("op_8795_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8795_strides_0 = const()[name = tensor<string, []>("op_8795_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8795_pad_0 = const()[name = tensor<string, []>("op_8795_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8795_dilations_0 = const()[name = tensor<string, []>("op_8795_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8795_groups_0 = const()[name = tensor<string, []>("op_8795_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_22_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381496576))), name = tensor<string, []>("layers_22_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [36470]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381423552))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_22_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(524905408))), name = tensor<string, []>("layers_22_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [36470]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(524832384))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_8795_cast_fp16 = conv(dilations = var_8795_dilations_0, groups = var_8795_groups_0, pad = var_8795_pad_0, pad_type = var_8795_pad_type_0, strides = var_8795_strides_0, weight = layers_22_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_597_cast_fp16)[name = tensor<string, []>("op_8795_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_599_cast_fp16 = add(x = var_8789_cast_fp16, y = var_8795_cast_fp16)[name = tensor<string, []>("input_599_cast_fp16")];
             tensor<int32, []> input_601_split_num_splits_0 = const()[name = tensor<string, []>("input_601_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -6259,8 +6259,8 @@ program(1.0)
             tensor<int32, []> input_603_groups_0 = const()[name = tensor<string, []>("input_603_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_603_strides_0 = const()[name = tensor<string, []>("input_603_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_603_dilations_0 = const()[name = tensor<string, []>("input_603_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_312_to_fp16 = const()[name = tensor<string, []>("const_312_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381758784)))];
-            tensor<fp16, [1024]> const_313_to_fp16 = const()[name = tensor<string, []>("const_313_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381777280)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_312_to_fp16 = const()[name = tensor<string, []>("const_312_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(525167616)))];
+            tensor<fp16, [1024]> const_313_to_fp16 = const()[name = tensor<string, []>("const_313_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(525186112)))];
             tensor<fp16, [1, 1024, 1, 188]> input_605_cast_fp16 = conv(bias = const_313_to_fp16, dilations = input_603_dilations_0, groups = input_603_groups_0, pad = input_603_pad_0, pad_type = input_603_pad_type_0, strides = input_603_strides_0, weight = const_312_to_fp16, x = input_601_cast_fp16)[name = tensor<string, []>("input_605_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_607_cast_fp16 = silu(x = input_605_cast_fp16)[name = tensor<string, []>("input_607_cast_fp16")];
             tensor<string, []> var_8817_pad_type_0 = const()[name = tensor<string, []>("op_8817_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6268,22 +6268,22 @@ program(1.0)
             tensor<int32, [4]> var_8817_pad_0 = const()[name = tensor<string, []>("op_8817_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8817_dilations_0 = const()[name = tensor<string, []>("op_8817_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8817_groups_0 = const()[name = tensor<string, []>("op_8817_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(381779392))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382303744))), name = tensor<string, []>("layers_22_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(525188224))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(525974720))), name = tensor<string, []>("layers_22_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8817_cast_fp16 = conv(dilations = var_8817_dilations_0, groups = var_8817_groups_0, pad = var_8817_pad_0, pad_type = var_8817_pad_type_0, strides = var_8817_strides_0, weight = layers_22_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_607_cast_fp16)[name = tensor<string, []>("op_8817_cast_fp16")];
             tensor<string, []> var_8823_pad_type_0 = const()[name = tensor<string, []>("op_8823_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8823_strides_0 = const()[name = tensor<string, []>("op_8823_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8823_pad_0 = const()[name = tensor<string, []>("op_8823_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8823_dilations_0 = const()[name = tensor<string, []>("op_8823_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8823_groups_0 = const()[name = tensor<string, []>("op_8823_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_22_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382337984))), name = tensor<string, []>("layers_22_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17006]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382303872))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_22_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526009024))), name = tensor<string, []>("layers_22_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [17006]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(525974912))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8823_cast_fp16 = conv(dilations = var_8823_dilations_0, groups = var_8823_groups_0, pad = var_8823_pad_0, pad_type = var_8823_pad_type_0, strides = var_8823_strides_0, weight = layers_22_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_607_cast_fp16)[name = tensor<string, []>("op_8823_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_137_cast_fp16 = add(x = var_8817_cast_fp16, y = var_8823_cast_fp16)[name = tensor<string, []>("x_137_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_227_cast_fp16 = add(x = inputs_225_cast_fp16, y = x_137_cast_fp16)[name = tensor<string, []>("inputs_227_cast_fp16")];
             tensor<int32, [1]> out_227_axes_0 = const()[name = tensor<string, []>("out_227_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8834_to_fp16 = const()[name = tensor<string, []>("op_8834_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_227_cast_fp16 = layer_norm(axes = out_227_axes_0, epsilon = var_8834_to_fp16, x = inputs_227_cast_fp16)[name = tensor<string, []>("out_227_cast_fp16")];
-            tensor<fp16, [1024]> input_609_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_609_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382469120)))];
-            tensor<fp16, [1024]> input_609_beta_0_to_fp16 = const()[name = tensor<string, []>("input_609_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382471232)))];
+            tensor<fp16, [1024]> input_609_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_609_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526140160)))];
+            tensor<fp16, [1024]> input_609_beta_0_to_fp16 = const()[name = tensor<string, []>("input_609_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526142272)))];
             tensor<fp16, []> input_609_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_609_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_609_cast_fp16 = batch_norm(beta = input_609_beta_0_to_fp16, epsilon = input_609_epsilon_0_to_fp16, gamma = input_609_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_227_cast_fp16)[name = tensor<string, []>("input_609_cast_fp16")];
             tensor<string, []> var_8854_pad_type_0 = const()[name = tensor<string, []>("op_8854_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6291,14 +6291,14 @@ program(1.0)
             tensor<int32, [4]> var_8854_pad_0 = const()[name = tensor<string, []>("op_8854_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8854_dilations_0 = const()[name = tensor<string, []>("op_8854_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8854_groups_0 = const()[name = tensor<string, []>("op_8854_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_22_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(382473344))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(384570560))), name = tensor<string, []>("layers_22_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_22_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526144384))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(529290176))), name = tensor<string, []>("layers_22_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8854_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_8854_dilations_0, groups = var_8854_groups_0, pad = var_8854_pad_0, pad_type = var_8854_pad_type_0, strides = var_8854_strides_0, weight = layers_22_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_609_cast_fp16)[name = tensor<string, []>("op_8854_cast_fp16")];
             tensor<string, []> var_8860_pad_type_0 = const()[name = tensor<string, []>("op_8860_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8860_strides_0 = const()[name = tensor<string, []>("op_8860_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8860_pad_0 = const()[name = tensor<string, []>("op_8860_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8860_dilations_0 = const()[name = tensor<string, []>("op_8860_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8860_groups_0 = const()[name = tensor<string, []>("op_8860_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_22_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(384708992))), name = tensor<string, []>("layers_22_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69118]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(384570688))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_22_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(529428672))), name = tensor<string, []>("layers_22_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [69118]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(529290368))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8860_cast_fp16 = conv(dilations = var_8860_dilations_0, groups = var_8860_groups_0, pad = var_8860_pad_0, pad_type = var_8860_pad_type_0, strides = var_8860_strides_0, weight = layers_22_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_609_cast_fp16)[name = tensor<string, []>("op_8860_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_611_cast_fp16 = add(x = var_8854_cast_fp16, y = var_8860_cast_fp16)[name = tensor<string, []>("input_611_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_613_cast_fp16 = silu(x = input_611_cast_fp16)[name = tensor<string, []>("input_613_cast_fp16")];
@@ -6307,14 +6307,14 @@ program(1.0)
             tensor<int32, [4]> var_8871_pad_0 = const()[name = tensor<string, []>("op_8871_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8871_dilations_0 = const()[name = tensor<string, []>("op_8871_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8871_groups_0 = const()[name = tensor<string, []>("op_8871_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_22_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(385233344))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(387330560))), name = tensor<string, []>("layers_22_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_22_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(529953024))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(533098816))), name = tensor<string, []>("layers_22_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8871_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8871_dilations_0, groups = var_8871_groups_0, pad = var_8871_pad_0, pad_type = var_8871_pad_type_0, strides = var_8871_strides_0, weight = layers_22_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_613_cast_fp16)[name = tensor<string, []>("op_8871_cast_fp16")];
             tensor<string, []> var_8877_pad_type_0 = const()[name = tensor<string, []>("op_8877_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8877_strides_0 = const()[name = tensor<string, []>("op_8877_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8877_pad_0 = const()[name = tensor<string, []>("op_8877_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8877_dilations_0 = const()[name = tensor<string, []>("op_8877_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8877_groups_0 = const()[name = tensor<string, []>("op_8877_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_22_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(387550912))), name = tensor<string, []>("layers_22_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [110071]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(387330688))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_22_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(533319232))), name = tensor<string, []>("layers_22_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [110071]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(533099008))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8877_cast_fp16 = conv(dilations = var_8877_dilations_0, groups = var_8877_groups_0, pad = var_8877_pad_0, pad_type = var_8877_pad_type_0, strides = var_8877_strides_0, weight = layers_22_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_613_cast_fp16)[name = tensor<string, []>("op_8877_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_139_cast_fp16 = add(x = var_8871_cast_fp16, y = var_8877_cast_fp16)[name = tensor<string, []>("x_139_cast_fp16")];
             tensor<fp16, []> var_8879_to_fp16 = const()[name = tensor<string, []>("op_8879_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -6323,16 +6323,16 @@ program(1.0)
             tensor<int32, [1]> out_229_axes_0 = const()[name = tensor<string, []>("out_229_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8890_to_fp16 = const()[name = tensor<string, []>("op_8890_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_229_cast_fp16 = layer_norm(axes = out_229_axes_0, epsilon = var_8890_to_fp16, x = inputs_229_cast_fp16)[name = tensor<string, []>("out_229_cast_fp16")];
-            tensor<fp16, [1024]> inputs_231_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_231_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388075264)))];
-            tensor<fp16, [1024]> inputs_231_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_231_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388077376)))];
+            tensor<fp16, [1024]> inputs_231_gamma_0_to_fp16 = const()[name = tensor<string, []>("inputs_231_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(533843584)))];
+            tensor<fp16, [1024]> inputs_231_beta_0_to_fp16 = const()[name = tensor<string, []>("inputs_231_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(533845696)))];
             tensor<fp16, []> inputs_231_epsilon_0_to_fp16 = const()[name = tensor<string, []>("inputs_231_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> inputs_231_cast_fp16 = batch_norm(beta = inputs_231_beta_0_to_fp16, epsilon = inputs_231_epsilon_0_to_fp16, gamma = inputs_231_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_229_cast_fp16)[name = tensor<string, []>("inputs_231_cast_fp16")];
             tensor<int32, []> var_8904 = const()[name = tensor<string, []>("op_8904"), val = tensor<int32, []>(3)];
             tensor<int32, [1]> out_231_axes_0 = const()[name = tensor<string, []>("out_231_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8935_to_fp16 = const()[name = tensor<string, []>("op_8935_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_231_cast_fp16 = layer_norm(axes = out_231_axes_0, epsilon = var_8935_to_fp16, x = inputs_231_cast_fp16)[name = tensor<string, []>("out_231_cast_fp16")];
-            tensor<fp16, [1024]> input_615_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_615_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388079488)))];
-            tensor<fp16, [1024]> input_615_beta_0_to_fp16 = const()[name = tensor<string, []>("input_615_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388081600)))];
+            tensor<fp16, [1024]> input_615_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_615_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(533847808)))];
+            tensor<fp16, [1024]> input_615_beta_0_to_fp16 = const()[name = tensor<string, []>("input_615_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(533849920)))];
             tensor<fp16, []> input_615_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_615_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_615_cast_fp16 = batch_norm(beta = input_615_beta_0_to_fp16, epsilon = input_615_epsilon_0_to_fp16, gamma = input_615_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_231_cast_fp16)[name = tensor<string, []>("input_615_cast_fp16")];
             tensor<string, []> var_8955_pad_type_0 = const()[name = tensor<string, []>("op_8955_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6340,14 +6340,14 @@ program(1.0)
             tensor<int32, [4]> var_8955_pad_0 = const()[name = tensor<string, []>("op_8955_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8955_dilations_0 = const()[name = tensor<string, []>("op_8955_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8955_groups_0 = const()[name = tensor<string, []>("op_8955_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_23_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(388083712))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(390180928))), name = tensor<string, []>("layers_23_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_23_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(533852032))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(536997824))), name = tensor<string, []>("layers_23_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8955_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_8955_dilations_0, groups = var_8955_groups_0, pad = var_8955_pad_0, pad_type = var_8955_pad_type_0, strides = var_8955_strides_0, weight = layers_23_feed_forward1_fc1_inlier_module_weight_to_fp16_palettized, x = input_615_cast_fp16)[name = tensor<string, []>("op_8955_cast_fp16")];
             tensor<string, []> var_8961_pad_type_0 = const()[name = tensor<string, []>("op_8961_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8961_strides_0 = const()[name = tensor<string, []>("op_8961_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8961_pad_0 = const()[name = tensor<string, []>("op_8961_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8961_dilations_0 = const()[name = tensor<string, []>("op_8961_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8961_groups_0 = const()[name = tensor<string, []>("op_8961_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_23_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(390308032))), name = tensor<string, []>("layers_23_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [63432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(390181056))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_23_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(537124992))), name = tensor<string, []>("layers_23_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [63432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(536998016))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_8961_cast_fp16 = conv(dilations = var_8961_dilations_0, groups = var_8961_groups_0, pad = var_8961_pad_0, pad_type = var_8961_pad_type_0, strides = var_8961_strides_0, weight = layers_23_feed_forward1_fc1_outlier_module_weight_to_fp16_sparsified, x = input_615_cast_fp16)[name = tensor<string, []>("op_8961_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_617_cast_fp16 = add(x = var_8955_cast_fp16, y = var_8961_cast_fp16)[name = tensor<string, []>("input_617_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_619_cast_fp16 = silu(x = input_617_cast_fp16)[name = tensor<string, []>("input_619_cast_fp16")];
@@ -6356,14 +6356,14 @@ program(1.0)
             tensor<int32, [4]> var_8972_pad_0 = const()[name = tensor<string, []>("op_8972_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8972_dilations_0 = const()[name = tensor<string, []>("op_8972_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8972_groups_0 = const()[name = tensor<string, []>("op_8972_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_23_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(390832384))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(392929600))), name = tensor<string, []>("layers_23_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_23_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(537649344))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(540795136))), name = tensor<string, []>("layers_23_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8972_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_8972_dilations_0, groups = var_8972_groups_0, pad = var_8972_pad_0, pad_type = var_8972_pad_type_0, strides = var_8972_strides_0, weight = layers_23_feed_forward1_fc2_inlier_module_weight_to_fp16_palettized, x = input_619_cast_fp16)[name = tensor<string, []>("op_8972_cast_fp16")];
             tensor<string, []> var_8978_pad_type_0 = const()[name = tensor<string, []>("op_8978_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_8978_strides_0 = const()[name = tensor<string, []>("op_8978_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_8978_pad_0 = const()[name = tensor<string, []>("op_8978_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_8978_dilations_0 = const()[name = tensor<string, []>("op_8978_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_8978_groups_0 = const()[name = tensor<string, []>("op_8978_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_23_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(393142080))), name = tensor<string, []>("layers_23_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [106123]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(392929728))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_23_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(541007680))), name = tensor<string, []>("layers_23_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [106123]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(540795328))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_8978_cast_fp16 = conv(dilations = var_8978_dilations_0, groups = var_8978_groups_0, pad = var_8978_pad_0, pad_type = var_8978_pad_type_0, strides = var_8978_strides_0, weight = layers_23_feed_forward1_fc2_outlier_module_weight_to_fp16_sparsified, x = input_619_cast_fp16)[name = tensor<string, []>("op_8978_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_141_cast_fp16 = add(x = var_8972_cast_fp16, y = var_8978_cast_fp16)[name = tensor<string, []>("x_141_cast_fp16")];
             tensor<fp16, []> var_8980_to_fp16 = const()[name = tensor<string, []>("op_8980_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -6372,8 +6372,8 @@ program(1.0)
             tensor<int32, [1]> out_233_axes_0 = const()[name = tensor<string, []>("out_233_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_8991_to_fp16 = const()[name = tensor<string, []>("op_8991_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_233_cast_fp16 = layer_norm(axes = out_233_axes_0, epsilon = var_8991_to_fp16, x = inputs_233_cast_fp16)[name = tensor<string, []>("out_233_cast_fp16")];
-            tensor<fp16, [1024]> obj_95_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_95_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(393666432)))];
-            tensor<fp16, [1024]> obj_95_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_95_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(393668544)))];
+            tensor<fp16, [1024]> obj_95_gamma_0_to_fp16 = const()[name = tensor<string, []>("obj_95_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(541532032)))];
+            tensor<fp16, [1024]> obj_95_beta_0_to_fp16 = const()[name = tensor<string, []>("obj_95_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(541534144)))];
             tensor<fp16, []> obj_95_epsilon_0_to_fp16 = const()[name = tensor<string, []>("obj_95_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> obj_95_cast_fp16 = batch_norm(beta = obj_95_beta_0_to_fp16, epsilon = obj_95_epsilon_0_to_fp16, gamma = obj_95_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_233_cast_fp16)[name = tensor<string, []>("obj_95_cast_fp16")];
             tensor<string, []> var_9016_pad_type_0 = const()[name = tensor<string, []>("op_9016_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6381,14 +6381,14 @@ program(1.0)
             tensor<int32, [4]> var_9016_pad_0 = const()[name = tensor<string, []>("op_9016_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9016_dilations_0 = const()[name = tensor<string, []>("op_9016_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9016_groups_0 = const()[name = tensor<string, []>("op_9016_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(393670656))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(394195008))), name = tensor<string, []>("layers_23_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_q_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(541536256))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(542322752))), name = tensor<string, []>("layers_23_self_attn_q_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9016_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_9016_dilations_0, groups = var_9016_groups_0, pad = var_9016_pad_0, pad_type = var_9016_pad_type_0, strides = var_9016_strides_0, weight = layers_23_self_attn_q_proj_inlier_module_weight_to_fp16_palettized, x = obj_95_cast_fp16)[name = tensor<string, []>("op_9016_cast_fp16")];
             tensor<string, []> var_9022_pad_type_0 = const()[name = tensor<string, []>("op_9022_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9022_strides_0 = const()[name = tensor<string, []>("op_9022_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9022_pad_0 = const()[name = tensor<string, []>("op_9022_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9022_dilations_0 = const()[name = tensor<string, []>("op_9022_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9022_groups_0 = const()[name = tensor<string, []>("op_9022_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(394224000))), name = tensor<string, []>("layers_23_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14379]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(394195136))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(542351808))), name = tensor<string, []>("layers_23_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14379]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(542322944))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9022_cast_fp16 = conv(dilations = var_9022_dilations_0, groups = var_9022_groups_0, pad = var_9022_pad_0, pad_type = var_9022_pad_type_0, strides = var_9022_strides_0, weight = layers_23_self_attn_q_proj_outlier_module_weight_to_fp16_sparsified, x = obj_95_cast_fp16)[name = tensor<string, []>("op_9022_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> query_93_cast_fp16 = add(x = var_9016_cast_fp16, y = var_9022_cast_fp16)[name = tensor<string, []>("query_93_cast_fp16")];
             tensor<string, []> var_9031_pad_type_0 = const()[name = tensor<string, []>("op_9031_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6396,14 +6396,14 @@ program(1.0)
             tensor<int32, [4]> var_9031_pad_0 = const()[name = tensor<string, []>("op_9031_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9031_dilations_0 = const()[name = tensor<string, []>("op_9031_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9031_groups_0 = const()[name = tensor<string, []>("op_9031_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(394355136))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(394879488))), name = tensor<string, []>("layers_23_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_k_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(542482944))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(543269440))), name = tensor<string, []>("layers_23_self_attn_k_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9031_cast_fp16 = conv(dilations = var_9031_dilations_0, groups = var_9031_groups_0, pad = var_9031_pad_0, pad_type = var_9031_pad_type_0, strides = var_9031_strides_0, weight = layers_23_self_attn_k_proj_inlier_module_weight_to_fp16_palettized, x = obj_95_cast_fp16)[name = tensor<string, []>("op_9031_cast_fp16")];
             tensor<string, []> var_9037_pad_type_0 = const()[name = tensor<string, []>("op_9037_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9037_strides_0 = const()[name = tensor<string, []>("op_9037_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9037_pad_0 = const()[name = tensor<string, []>("op_9037_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9037_dilations_0 = const()[name = tensor<string, []>("op_9037_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9037_groups_0 = const()[name = tensor<string, []>("op_9037_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(394909056))), name = tensor<string, []>("layers_23_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14687]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(394879616))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(543299072))), name = tensor<string, []>("layers_23_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14687]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(543269632))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9037_cast_fp16 = conv(dilations = var_9037_dilations_0, groups = var_9037_groups_0, pad = var_9037_pad_0, pad_type = var_9037_pad_type_0, strides = var_9037_strides_0, weight = layers_23_self_attn_k_proj_outlier_module_weight_to_fp16_sparsified, x = obj_95_cast_fp16)[name = tensor<string, []>("op_9037_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> key_cast_fp16 = add(x = var_9031_cast_fp16, y = var_9037_cast_fp16)[name = tensor<string, []>("key_cast_fp16")];
             tensor<string, []> var_9047_pad_type_0 = const()[name = tensor<string, []>("op_9047_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6411,33 +6411,33 @@ program(1.0)
             tensor<int32, [4]> var_9047_pad_0 = const()[name = tensor<string, []>("op_9047_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9047_dilations_0 = const()[name = tensor<string, []>("op_9047_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9047_groups_0 = const()[name = tensor<string, []>("op_9047_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395040192))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395564544))), name = tensor<string, []>("layers_23_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_v_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(543430208))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(544216704))), name = tensor<string, []>("layers_23_self_attn_v_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9047_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_9047_dilations_0, groups = var_9047_groups_0, pad = var_9047_pad_0, pad_type = var_9047_pad_type_0, strides = var_9047_strides_0, weight = layers_23_self_attn_v_proj_inlier_module_weight_to_fp16_palettized, x = obj_95_cast_fp16)[name = tensor<string, []>("op_9047_cast_fp16")];
             tensor<string, []> var_9053_pad_type_0 = const()[name = tensor<string, []>("op_9053_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9053_strides_0 = const()[name = tensor<string, []>("op_9053_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9053_pad_0 = const()[name = tensor<string, []>("op_9053_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9053_dilations_0 = const()[name = tensor<string, []>("op_9053_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9053_groups_0 = const()[name = tensor<string, []>("op_9053_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395594496))), name = tensor<string, []>("layers_23_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14870]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395564672))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(544246720))), name = tensor<string, []>("layers_23_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [14870]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(544216896))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9053_cast_fp16 = conv(dilations = var_9053_dilations_0, groups = var_9053_groups_0, pad = var_9053_pad_0, pad_type = var_9053_pad_type_0, strides = var_9053_strides_0, weight = layers_23_self_attn_v_proj_outlier_module_weight_to_fp16_sparsified, x = obj_95_cast_fp16)[name = tensor<string, []>("op_9053_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> value_cast_fp16 = add(x = var_9047_cast_fp16, y = var_9053_cast_fp16)[name = tensor<string, []>("value_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_9056_to_fp16 = const()[name = tensor<string, []>("op_9056_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395725632)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_9056_to_fp16 = const()[name = tensor<string, []>("op_9056_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(544377856)))];
             tensor<fp16, [1, 1024, 1, 188]> query_cast_fp16 = add(x = query_93_cast_fp16, y = var_9056_to_fp16)[name = tensor<string, []>("query_cast_fp16")];
-            tensor<fp16, [1, 1024, 1, 1]> var_9059_to_fp16 = const()[name = tensor<string, []>("op_9059_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395727744)))];
+            tensor<fp16, [1, 1024, 1, 1]> var_9059_to_fp16 = const()[name = tensor<string, []>("op_9059_to_fp16"), val = tensor<fp16, [1, 1024, 1, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(544379968)))];
             tensor<fp16, [1, 1024, 1, 188]> q_with_bias_v_cast_fp16 = add(x = query_93_cast_fp16, y = var_9059_to_fp16)[name = tensor<string, []>("q_with_bias_v_cast_fp16")];
             tensor<string, []> var_9069_pad_type_0 = const()[name = tensor<string, []>("op_9069_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9069_strides_0 = const()[name = tensor<string, []>("op_9069_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9069_pad_0 = const()[name = tensor<string, []>("op_9069_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9069_dilations_0 = const()[name = tensor<string, []>("op_9069_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9069_groups_0 = const()[name = tensor<string, []>("op_9069_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(395729856))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396254208))), name = tensor<string, []>("layers_23_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(544382080))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(545168576))), name = tensor<string, []>("layers_23_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_9069_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_9069_dilations_0, groups = var_9069_groups_0, pad = var_9069_pad_0, pad_type = var_9069_pad_type_0, strides = var_9069_strides_0, weight = layers_23_self_attn_linear_pos_inlier_module_weight_to_fp16_palettized, x = obj_3_cast_fp16)[name = tensor<string, []>("op_9069_cast_fp16")];
             tensor<string, []> var_9075_pad_type_0 = const()[name = tensor<string, []>("op_9075_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9075_strides_0 = const()[name = tensor<string, []>("op_9075_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9075_pad_0 = const()[name = tensor<string, []>("op_9075_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9075_dilations_0 = const()[name = tensor<string, []>("op_9075_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9075_groups_0 = const()[name = tensor<string, []>("op_9075_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396335744))), name = tensor<string, []>("layers_23_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [40648]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396254336))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(545250176))), name = tensor<string, []>("layers_23_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [40648]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(545168768))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 375]> var_9075_cast_fp16 = conv(dilations = var_9075_dilations_0, groups = var_9075_groups_0, pad = var_9075_pad_0, pad_type = var_9075_pad_type_0, strides = var_9075_strides_0, weight = layers_23_self_attn_linear_pos_outlier_module_weight_to_fp16_sparsified, x = obj_3_cast_fp16)[name = tensor<string, []>("op_9075_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 375]> p_cast_fp16 = add(x = var_9069_cast_fp16, y = var_9075_cast_fp16)[name = tensor<string, []>("p_cast_fp16")];
             tensor<int32, [4]> var_9079 = const()[name = tensor<string, []>("op_9079"), val = tensor<int32, [4]>([1, 8, 128, 188])];
@@ -6488,22 +6488,22 @@ program(1.0)
             tensor<int32, [4]> var_9132_pad_0 = const()[name = tensor<string, []>("op_9132_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9132_dilations_0 = const()[name = tensor<string, []>("op_9132_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9132_groups_0 = const()[name = tensor<string, []>("op_9132_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396466880))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396991232))), name = tensor<string, []>("layers_23_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_o_proj_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(545381312))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(546167808))), name = tensor<string, []>("layers_23_self_attn_o_proj_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9132_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_9132_dilations_0, groups = var_9132_groups_0, pad = var_9132_pad_0, pad_type = var_9132_pad_type_0, strides = var_9132_strides_0, weight = layers_23_self_attn_o_proj_inlier_module_weight_to_fp16_palettized, x = input_621_cast_fp16)[name = tensor<string, []>("op_9132_cast_fp16")];
             tensor<string, []> var_9138_pad_type_0 = const()[name = tensor<string, []>("op_9138_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9138_strides_0 = const()[name = tensor<string, []>("op_9138_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9138_pad_0 = const()[name = tensor<string, []>("op_9138_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9138_dilations_0 = const()[name = tensor<string, []>("op_9138_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9138_groups_0 = const()[name = tensor<string, []>("op_9138_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(397024704))), name = tensor<string, []>("layers_23_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16623]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(396991360))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(546201344))), name = tensor<string, []>("layers_23_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [16623]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(546168000))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9138_cast_fp16 = conv(dilations = var_9138_dilations_0, groups = var_9138_groups_0, pad = var_9138_pad_0, pad_type = var_9138_pad_type_0, strides = var_9138_strides_0, weight = layers_23_self_attn_o_proj_outlier_module_weight_to_fp16_sparsified, x = input_621_cast_fp16)[name = tensor<string, []>("op_9138_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> obj_cast_fp16 = add(x = var_9132_cast_fp16, y = var_9138_cast_fp16)[name = tensor<string, []>("obj_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_235_cast_fp16 = add(x = inputs_233_cast_fp16, y = obj_cast_fp16)[name = tensor<string, []>("inputs_235_cast_fp16")];
             tensor<int32, [1]> out_235_axes_0 = const()[name = tensor<string, []>("out_235_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_9149_to_fp16 = const()[name = tensor<string, []>("op_9149_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_235_cast_fp16 = layer_norm(axes = out_235_axes_0, epsilon = var_9149_to_fp16, x = inputs_235_cast_fp16)[name = tensor<string, []>("out_235_cast_fp16")];
-            tensor<fp16, [1024]> input_623_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_623_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(397155840)))];
-            tensor<fp16, [1024]> input_623_beta_0_to_fp16 = const()[name = tensor<string, []>("input_623_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(397157952)))];
+            tensor<fp16, [1024]> input_623_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_623_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(546332480)))];
+            tensor<fp16, [1024]> input_623_beta_0_to_fp16 = const()[name = tensor<string, []>("input_623_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(546334592)))];
             tensor<fp16, []> input_623_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_623_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_623_cast_fp16 = batch_norm(beta = input_623_beta_0_to_fp16, epsilon = input_623_epsilon_0_to_fp16, gamma = input_623_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_235_cast_fp16)[name = tensor<string, []>("input_623_cast_fp16")];
             tensor<string, []> var_9170_pad_type_0 = const()[name = tensor<string, []>("op_9170_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6511,14 +6511,14 @@ program(1.0)
             tensor<int32, [4]> var_9170_pad_0 = const()[name = tensor<string, []>("op_9170_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9170_dilations_0 = const()[name = tensor<string, []>("op_9170_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9170_groups_0 = const()[name = tensor<string, []>("op_9170_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_23_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1048576]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(397160064))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(398208704))), name = tensor<string, []>("layers_23_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_23_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [1572864]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(546336704))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(547909632))), name = tensor<string, []>("layers_23_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_9170_cast_fp16 = conv(dilations = var_9170_dilations_0, groups = var_9170_groups_0, pad = var_9170_pad_0, pad_type = var_9170_pad_type_0, strides = var_9170_strides_0, weight = layers_23_conv_pointwise_conv1_inlier_module_weight_to_fp16_palettized, x = input_623_cast_fp16)[name = tensor<string, []>("op_9170_cast_fp16")];
             tensor<string, []> var_9176_pad_type_0 = const()[name = tensor<string, []>("op_9176_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9176_strides_0 = const()[name = tensor<string, []>("op_9176_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9176_pad_0 = const()[name = tensor<string, []>("op_9176_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9176_dilations_0 = const()[name = tensor<string, []>("op_9176_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9176_groups_0 = const()[name = tensor<string, []>("op_9176_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [2048, 1024, 1, 1]> layers_23_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(398278464))), name = tensor<string, []>("layers_23_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [34784]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(398208832))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
+            tensor<fp16, [2048, 1024, 1, 1]> layers_23_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [262144]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(547979456))), name = tensor<string, []>("layers_23_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [34784]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(547909824))), shape = tensor<uint32, [4]>([2048, 1024, 1, 1])];
             tensor<fp16, [1, 2048, 1, 188]> var_9176_cast_fp16 = conv(dilations = var_9176_dilations_0, groups = var_9176_groups_0, pad = var_9176_pad_0, pad_type = var_9176_pad_type_0, strides = var_9176_strides_0, weight = layers_23_conv_pointwise_conv1_outlier_module_weight_to_fp16_sparsified, x = input_623_cast_fp16)[name = tensor<string, []>("op_9176_cast_fp16")];
             tensor<fp16, [1, 2048, 1, 188]> input_625_cast_fp16 = add(x = var_9170_cast_fp16, y = var_9176_cast_fp16)[name = tensor<string, []>("input_625_cast_fp16")];
             tensor<int32, []> input_627_split_num_splits_0 = const()[name = tensor<string, []>("input_627_split_num_splits_0"), val = tensor<int32, []>(2)];
@@ -6531,8 +6531,8 @@ program(1.0)
             tensor<int32, []> input_629_groups_0 = const()[name = tensor<string, []>("input_629_groups_0"), val = tensor<int32, []>(1024)];
             tensor<int32, [2]> input_629_strides_0 = const()[name = tensor<string, []>("input_629_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [2]> input_629_dilations_0 = const()[name = tensor<string, []>("input_629_dilations_0"), val = tensor<int32, [2]>([1, 1])];
-            tensor<fp16, [1024, 1, 1, 9]> const_314_to_fp16 = const()[name = tensor<string, []>("const_314_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(398540672)))];
-            tensor<fp16, [1024]> const_315_to_fp16 = const()[name = tensor<string, []>("const_315_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(398559168)))];
+            tensor<fp16, [1024, 1, 1, 9]> const_314_to_fp16 = const()[name = tensor<string, []>("const_314_to_fp16"), val = tensor<fp16, [1024, 1, 1, 9]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(548241664)))];
+            tensor<fp16, [1024]> const_315_to_fp16 = const()[name = tensor<string, []>("const_315_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(548260160)))];
             tensor<fp16, [1, 1024, 1, 188]> input_631_cast_fp16 = conv(bias = const_315_to_fp16, dilations = input_629_dilations_0, groups = input_629_groups_0, pad = input_629_pad_0, pad_type = input_629_pad_type_0, strides = input_629_strides_0, weight = const_314_to_fp16, x = input_627_cast_fp16)[name = tensor<string, []>("input_631_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> input_633_cast_fp16 = silu(x = input_631_cast_fp16)[name = tensor<string, []>("input_633_cast_fp16")];
             tensor<string, []> var_9198_pad_type_0 = const()[name = tensor<string, []>("op_9198_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6540,22 +6540,22 @@ program(1.0)
             tensor<int32, [4]> var_9198_pad_0 = const()[name = tensor<string, []>("op_9198_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9198_dilations_0 = const()[name = tensor<string, []>("op_9198_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9198_groups_0 = const()[name = tensor<string, []>("op_9198_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(398561280))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(399085632))), name = tensor<string, []>("layers_23_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [786432]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(548262272))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(549048768))), name = tensor<string, []>("layers_23_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9198_cast_fp16 = conv(dilations = var_9198_dilations_0, groups = var_9198_groups_0, pad = var_9198_pad_0, pad_type = var_9198_pad_type_0, strides = var_9198_strides_0, weight = layers_23_conv_pointwise_conv2_inlier_module_weight_to_fp16_palettized, x = input_633_cast_fp16)[name = tensor<string, []>("op_9198_cast_fp16")];
             tensor<string, []> var_9204_pad_type_0 = const()[name = tensor<string, []>("op_9204_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9204_strides_0 = const()[name = tensor<string, []>("op_9204_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9204_pad_0 = const()[name = tensor<string, []>("op_9204_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9204_dilations_0 = const()[name = tensor<string, []>("op_9204_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9204_groups_0 = const()[name = tensor<string, []>("op_9204_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 1024, 1, 1]> layers_23_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(399122304))), name = tensor<string, []>("layers_23_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18238]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(399085760))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
+            tensor<fp16, [1024, 1024, 1, 1]> layers_23_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [131072]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(549085504))), name = tensor<string, []>("layers_23_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [18238]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(549048960))), shape = tensor<uint32, [4]>([1024, 1024, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9204_cast_fp16 = conv(dilations = var_9204_dilations_0, groups = var_9204_groups_0, pad = var_9204_pad_0, pad_type = var_9204_pad_type_0, strides = var_9204_strides_0, weight = layers_23_conv_pointwise_conv2_outlier_module_weight_to_fp16_sparsified, x = input_633_cast_fp16)[name = tensor<string, []>("op_9204_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_143_cast_fp16 = add(x = var_9198_cast_fp16, y = var_9204_cast_fp16)[name = tensor<string, []>("x_143_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> inputs_237_cast_fp16 = add(x = inputs_235_cast_fp16, y = x_143_cast_fp16)[name = tensor<string, []>("inputs_237_cast_fp16")];
             tensor<int32, [1]> out_237_axes_0 = const()[name = tensor<string, []>("out_237_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_9215_to_fp16 = const()[name = tensor<string, []>("op_9215_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_237_cast_fp16 = layer_norm(axes = out_237_axes_0, epsilon = var_9215_to_fp16, x = inputs_237_cast_fp16)[name = tensor<string, []>("out_237_cast_fp16")];
-            tensor<fp16, [1024]> input_635_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_635_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(399253440)))];
-            tensor<fp16, [1024]> input_635_beta_0_to_fp16 = const()[name = tensor<string, []>("input_635_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(399255552)))];
+            tensor<fp16, [1024]> input_635_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_635_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(549216640)))];
+            tensor<fp16, [1024]> input_635_beta_0_to_fp16 = const()[name = tensor<string, []>("input_635_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(549218752)))];
             tensor<fp16, []> input_635_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_635_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_635_cast_fp16 = batch_norm(beta = input_635_beta_0_to_fp16, epsilon = input_635_epsilon_0_to_fp16, gamma = input_635_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_237_cast_fp16)[name = tensor<string, []>("input_635_cast_fp16")];
             tensor<string, []> var_9235_pad_type_0 = const()[name = tensor<string, []>("op_9235_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6563,14 +6563,14 @@ program(1.0)
             tensor<int32, [4]> var_9235_pad_0 = const()[name = tensor<string, []>("op_9235_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9235_dilations_0 = const()[name = tensor<string, []>("op_9235_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9235_groups_0 = const()[name = tensor<string, []>("op_9235_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_23_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(399257664))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(401354880))), name = tensor<string, []>("layers_23_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_23_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(549220864))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(552366656))), name = tensor<string, []>("layers_23_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_9235_cast_fp16 = conv(bias = layers_0_feed_forward1_fc1_inlier_module_bias_to_fp16, dilations = var_9235_dilations_0, groups = var_9235_groups_0, pad = var_9235_pad_0, pad_type = var_9235_pad_type_0, strides = var_9235_strides_0, weight = layers_23_feed_forward2_fc1_inlier_module_weight_to_fp16_palettized, x = input_635_cast_fp16)[name = tensor<string, []>("op_9235_cast_fp16")];
             tensor<string, []> var_9241_pad_type_0 = const()[name = tensor<string, []>("op_9241_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9241_strides_0 = const()[name = tensor<string, []>("op_9241_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9241_pad_0 = const()[name = tensor<string, []>("op_9241_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9241_dilations_0 = const()[name = tensor<string, []>("op_9241_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9241_groups_0 = const()[name = tensor<string, []>("op_9241_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [4096, 1024, 1, 1]> layers_23_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(401484672))), name = tensor<string, []>("layers_23_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [64791]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(401355008))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
+            tensor<fp16, [4096, 1024, 1, 1]> layers_23_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(552496512))), name = tensor<string, []>("layers_23_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [64791]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(552366848))), shape = tensor<uint32, [4]>([4096, 1024, 1, 1])];
             tensor<fp16, [1, 4096, 1, 188]> var_9241_cast_fp16 = conv(dilations = var_9241_dilations_0, groups = var_9241_groups_0, pad = var_9241_pad_0, pad_type = var_9241_pad_type_0, strides = var_9241_strides_0, weight = layers_23_feed_forward2_fc1_outlier_module_weight_to_fp16_sparsified, x = input_635_cast_fp16)[name = tensor<string, []>("op_9241_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_637_cast_fp16 = add(x = var_9235_cast_fp16, y = var_9241_cast_fp16)[name = tensor<string, []>("input_637_cast_fp16")];
             tensor<fp16, [1, 4096, 1, 188]> input_639_cast_fp16 = silu(x = input_637_cast_fp16)[name = tensor<string, []>("input_639_cast_fp16")];
@@ -6579,14 +6579,14 @@ program(1.0)
             tensor<int32, [4]> var_9252_pad_0 = const()[name = tensor<string, []>("op_9252_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9252_dilations_0 = const()[name = tensor<string, []>("op_9252_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9252_groups_0 = const()[name = tensor<string, []>("op_9252_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_23_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [2097152]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(402009024))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404106240))), name = tensor<string, []>("layers_23_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_23_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [3145728]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(553020864))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(556166656))), name = tensor<string, []>("layers_23_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9252_cast_fp16 = conv(bias = input_17_mean_0_to_fp16, dilations = var_9252_dilations_0, groups = var_9252_groups_0, pad = var_9252_pad_0, pad_type = var_9252_pad_type_0, strides = var_9252_strides_0, weight = layers_23_feed_forward2_fc2_inlier_module_weight_to_fp16_palettized, x = input_639_cast_fp16)[name = tensor<string, []>("op_9252_cast_fp16")];
             tensor<string, []> var_9258_pad_type_0 = const()[name = tensor<string, []>("op_9258_pad_type_0"), val = tensor<string, []>("valid")];
             tensor<int32, [2]> var_9258_strides_0 = const()[name = tensor<string, []>("op_9258_strides_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, [4]> var_9258_pad_0 = const()[name = tensor<string, []>("op_9258_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9258_dilations_0 = const()[name = tensor<string, []>("op_9258_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9258_groups_0 = const()[name = tensor<string, []>("op_9258_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [1024, 4096, 1, 1]> layers_23_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404284288))), name = tensor<string, []>("layers_23_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [88901]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404106368))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
+            tensor<fp16, [1024, 4096, 1, 1]> layers_23_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified = constexpr_sparse_to_dense()[mask = tensor<uint8, [524288]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(556344768))), name = tensor<string, []>("layers_23_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified"), nonzero_data = tensor<fp16, [88901]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(556166848))), shape = tensor<uint32, [4]>([1024, 4096, 1, 1])];
             tensor<fp16, [1, 1024, 1, 188]> var_9258_cast_fp16 = conv(dilations = var_9258_dilations_0, groups = var_9258_groups_0, pad = var_9258_pad_0, pad_type = var_9258_pad_type_0, strides = var_9258_strides_0, weight = layers_23_feed_forward2_fc2_outlier_module_weight_to_fp16_sparsified, x = input_639_cast_fp16)[name = tensor<string, []>("op_9258_cast_fp16")];
             tensor<fp16, [1, 1024, 1, 188]> x_cast_fp16 = add(x = var_9252_cast_fp16, y = var_9258_cast_fp16)[name = tensor<string, []>("x_cast_fp16")];
             tensor<fp16, []> var_9260_to_fp16 = const()[name = tensor<string, []>("op_9260_to_fp16"), val = tensor<fp16, []>(0x1p-1)];
@@ -6595,8 +6595,8 @@ program(1.0)
             tensor<int32, [1]> out_239_axes_0 = const()[name = tensor<string, []>("out_239_axes_0"), val = tensor<int32, [1]>([1])];
             tensor<fp16, []> var_9271_to_fp16 = const()[name = tensor<string, []>("op_9271_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> out_239_cast_fp16 = layer_norm(axes = out_239_axes_0, epsilon = var_9271_to_fp16, x = inputs_cast_fp16)[name = tensor<string, []>("out_239_cast_fp16")];
-            tensor<fp16, [1024]> input_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404808640)))];
-            tensor<fp16, [1024]> input_beta_0_to_fp16 = const()[name = tensor<string, []>("input_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404810752)))];
+            tensor<fp16, [1024]> input_gamma_0_to_fp16 = const()[name = tensor<string, []>("input_gamma_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(556869120)))];
+            tensor<fp16, [1024]> input_beta_0_to_fp16 = const()[name = tensor<string, []>("input_beta_0_to_fp16"), val = tensor<fp16, [1024]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(556871232)))];
             tensor<fp16, []> input_epsilon_0_to_fp16 = const()[name = tensor<string, []>("input_epsilon_0_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
             tensor<fp16, [1, 1024, 1, 188]> input_cast_fp16 = batch_norm(beta = input_beta_0_to_fp16, epsilon = input_epsilon_0_to_fp16, gamma = input_gamma_0_to_fp16, mean = input_17_mean_0_to_fp16, variance = input_17_variance_0_to_fp16, x = out_239_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
             tensor<string, []> var_9291_pad_type_0 = const()[name = tensor<string, []>("op_9291_pad_type_0"), val = tensor<string, []>("valid")];
@@ -6604,8 +6604,8 @@ program(1.0)
             tensor<int32, [4]> var_9291_pad_0 = const()[name = tensor<string, []>("op_9291_pad_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
             tensor<int32, [2]> var_9291_dilations_0 = const()[name = tensor<string, []>("op_9291_dilations_0"), val = tensor<int32, [2]>([1, 1])];
             tensor<int32, []> var_9291_groups_0 = const()[name = tensor<string, []>("op_9291_groups_0"), val = tensor<int32, []>(1)];
-            tensor<fp16, [16385, 1024, 1, 1]> ctc_head_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [8389120]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(404812864))), lut = tensor<fp16, [16]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(413202048))), name = tensor<string, []>("ctc_head_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([16385, 1024, 1, 1])];
-            tensor<fp16, [16385]> ctc_head_bias_to_fp16 = const()[name = tensor<string, []>("ctc_head_bias_to_fp16"), val = tensor<fp16, [16385]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(413202176)))];
+            tensor<fp16, [16385, 1024, 1, 1]> ctc_head_weight_to_fp16_palettized = constexpr_lut_to_dense()[indices = tensor<uint8, [12583680]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(556873344))), lut = tensor<fp16, [64]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(569457088))), name = tensor<string, []>("ctc_head_weight_to_fp16_palettized"), shape = tensor<uint32, [4]>([16385, 1024, 1, 1])];
+            tensor<fp16, [16385]> ctc_head_bias_to_fp16 = const()[name = tensor<string, []>("ctc_head_bias_to_fp16"), val = tensor<fp16, [16385]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(569457280)))];
             tensor<fp16, [1, 16385, 1, 188]> ctc_head_raw_output = conv(bias = ctc_head_bias_to_fp16, dilations = var_9291_dilations_0, groups = var_9291_groups_0, pad = var_9291_pad_0, pad_type = var_9291_pad_type_0, strides = var_9291_strides_0, weight = ctc_head_weight_to_fp16_palettized, x = input_cast_fp16)[name = tensor<string, []>("op_9291_cast_fp16")];
         } -> (ctc_head_raw_output);
 }
\ No newline at end of file