specsGuy committed · Commit 308e34c · verified · 1 Parent(s): 93a788b

Update configuration_deepseekocr.py

Files changed (1):
  1. configuration_deepseekocr.py (+35 −61)
configuration_deepseekocr.py CHANGED
````diff
@@ -5,90 +5,64 @@
 
 from transformers.configuration_utils import PretrainedConfig
 from transformers.utils import logging
+from .configuration_deepseek_v2 import DeepseekV2Config
 
 logger = logging.get_logger(__name__)
 
 DEEPSEEK_OCR_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
 
-
-class DeepseekOCRConfig(PretrainedConfig):
+class DeepseekOCRConfig(DeepseekV2Config):
     """
-    Configuration class for the Deepseek-OCR model.
-    Allows Hugging Face Auto classes to recognize this custom architecture.
+    Config for Deepseek-OCR.
 
-    Example:
-    ```python
-    from transformers import AutoConfig
-    config = AutoConfig.from_pretrained("specsGuy/Deepseek-ocr", trust_remote_code=True)
-    print(config.model_type)  # deepseekocr
-    ```
+    It extends DeepseekV2Config (the language model config) and adds
+    OCR / vision-specific fields: projector_config, vision_config,
+    candidate_resolutions, etc.
+
+    This lets DeepseekOCRModel (which subclasses DeepseekV2Model)
+    see ALL the attributes it expects (hidden_act, attention_bias, etc.)
+    while still letting us keep multimodal metadata.
     """
 
-    model_type = "deepseekocr"  # ✅ REQUIRED for AutoConfig detection
+    model_type = "deepseekocr"
 
     def __init__(
         self,
-        vocab_size=129280,
-        hidden_size=1280,
-        intermediate_size=6848,
-        num_hidden_layers=12,
-        num_attention_heads=10,
-        num_key_value_heads=10,
-        max_position_embeddings=8192,
-        moe_intermediate_size=896,
-        n_group=1,
-        n_routed_experts=64,
-        n_shared_experts=2,
-        num_experts_per_tok=6,
-        first_k_dense_replace=1,
-        topk_group=1,
-        topk_method="greedy",
-        use_mla=False,
-        bos_token_id=0,
-        eos_token_id=1,
-        torch_dtype="bfloat16",
+        # OCR / vision specific
+        candidate_resolutions=None,
+        global_view_pos="head",
+        tile_tag="2D",
         projector_config=None,
         vision_config=None,
         language_config=None,
+        torch_dtype="bfloat16",
         **kwargs,
     ):
-        # Core architecture settings
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        self.max_position_embeddings = max_position_embeddings
-        self.moe_intermediate_size = moe_intermediate_size
-        self.attention_dropout = kwargs.get("attention_dropout", 0.0)
-        self.attention_bias = kwargs.get("attention_bias", False)
-
-        # Mixture of Experts & attention params
-        self.n_group = n_group
-        self.n_routed_experts = n_routed_experts
-        self.n_shared_experts = n_shared_experts
-        self.num_experts_per_tok = num_experts_per_tok
-        self.first_k_dense_replace = first_k_dense_replace
-        self.topk_group = topk_group
-        self.topk_method = topk_method
-        self.use_mla = use_mla
-
-        # Tokens & dtype
-        self.bos_token_id = bos_token_id
-        self.eos_token_id = eos_token_id
-        self.torch_dtype = torch_dtype
-
-        # Subconfigs (for multimodal alignment)
+        """
+        Parameters that are *not* listed here (hidden_size, num_attention_heads,
+        num_hidden_layers, hidden_act, attention_bias, etc.) are passed via
+        **kwargs and handled by DeepseekV2Config.__init__.
+        """
+
+        # If a nested language_config is provided (like in your config.json),
+        # use it as a base and let top-level kwargs override it.
+        if language_config is not None and isinstance(language_config, dict):
+            base = dict(language_config)
+            base.update(kwargs)
+            kwargs = base
+
+        # Let DeepseekV2Config set all LM-related fields:
+        super().__init__(torch_dtype=torch_dtype, **kwargs)
+
+        # OCR-specific / multimodal metadata
+        self.candidate_resolutions = candidate_resolutions or [[1024, 1024]]
+        self.global_view_pos = global_view_pos
+        self.tile_tag = tile_tag
+
+        # Keep the sub-configs so modeling_deepseekocr can read them if needed
         self.projector_config = projector_config
         self.vision_config = vision_config
         self.language_config = language_config
 
-        super().__init__(
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            **kwargs,
-        )
-
-        logger.info("✅ Initialized DeepseekOCRConfig successfully.")
+        logger.info("✅ DeepseekOCRConfig initialized (inherits DeepseekV2Config).")
````
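For reference, a minimal loading sketch along the lines of the example dropped from the old docstring. It assumes the `specsGuy/Deepseek-ocr` repo wires this class up via `auto_map` in its `config.json`:

```python
# Minimal sketch, assuming the hub repo "specsGuy/Deepseek-ocr" registers
# DeepseekOCRConfig via auto_map in config.json (as the old docstring showed).
from transformers import AutoConfig

config = AutoConfig.from_pretrained("specsGuy/Deepseek-ocr", trust_remote_code=True)
print(config.model_type)  # deepseekocr
# LM attributes are now inherited from DeepseekV2Config:
print(config.hidden_size, config.num_attention_heads)
```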
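And a short sketch of the new `language_config` merge behavior: the nested dict seeds the kwargs passed to `DeepseekV2Config.__init__`, and explicit top-level kwargs win. The values below are illustrative only, not the model's real hyperparameters:

```python
# Assumes this runs next to configuration_deepseekocr.py.
from configuration_deepseekocr import DeepseekOCRConfig

cfg = DeepseekOCRConfig(
    language_config={"hidden_size": 1280, "num_hidden_layers": 12},
    hidden_size=2048,  # top-level kwarg overrides the nested value
)
assert cfg.hidden_size == 2048                      # override applied
assert cfg.num_hidden_layers == 12                  # taken from language_config
assert cfg.language_config["hidden_size"] == 1280   # raw sub-config kept as-is
```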