# configuration_deepseekocr.py
# ------------------------------------------------------------
# Configuration class for the Deepseek-OCR model
# ------------------------------------------------------------
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)

# Empty on purpose: kept only so tooling that still imports the (deprecated)
# pretrained-config archive map does not break.
DEEPSEEK_OCR_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DeepseekOCRConfig(PretrainedConfig):
    """
    Configuration class for the Deepseek-OCR model.

    Allows Hugging Face Auto classes to recognize this custom architecture.

    Example:
    ```python
    from transformers import AutoConfig
    config = AutoConfig.from_pretrained("specsGuy/Deepseek-ocr", trust_remote_code=True)
    print(config.model_type)  # deepseekocr
    ```
    """

    # REQUIRED for AutoConfig detection (maps "deepseekocr" -> this class).
    model_type = "deepseekocr"

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=1280,
        intermediate_size=6848,
        num_hidden_layers=12,
        num_attention_heads=10,
        num_key_value_heads=10,
        max_position_embeddings=8192,
        moe_intermediate_size=896,
        n_group=1,
        n_routed_experts=64,
        n_shared_experts=2,
        num_experts_per_tok=6,
        first_k_dense_replace=1,
        topk_group=1,
        topk_method="greedy",
        use_mla=False,
        bos_token_id=0,
        eos_token_id=1,
        torch_dtype="bfloat16",
        projector_config=None,
        vision_config=None,
        language_config=None,
        **kwargs,
    ):
        """
        Build a Deepseek-OCR configuration.

        Args:
            vocab_size (int): Size of the token vocabulary.
            hidden_size (int): Dimensionality of the hidden states.
            intermediate_size (int): Dimensionality of the dense MLP layers.
            num_hidden_layers (int): Number of transformer layers.
            num_attention_heads (int): Number of attention heads.
            num_key_value_heads (int): Number of key/value heads (GQA).
            max_position_embeddings (int): Maximum supported sequence length.
            moe_intermediate_size (int): Hidden size of each MoE expert MLP.
            n_group (int): Number of expert groups for routing.
            n_routed_experts (int): Number of routed experts per MoE layer.
            n_shared_experts (int): Number of always-active shared experts.
            num_experts_per_tok (int): Experts selected per token.
            first_k_dense_replace (int): Number of leading layers that stay
                dense (non-MoE).
            topk_group (int): Groups considered during top-k routing.
            topk_method (str): Expert-selection strategy (e.g. ``"greedy"``).
            use_mla (bool): Whether to use multi-head latent attention.
            bos_token_id (int): Beginning-of-sequence token id.
            eos_token_id (int): End-of-sequence token id.
            torch_dtype (str): Preferred weights dtype (e.g. ``"bfloat16"``).
            projector_config (dict, optional): Sub-config for the multimodal
                projector.
            vision_config (dict, optional): Sub-config for the vision encoder.
            language_config (dict, optional): Sub-config for the language model.
            **kwargs: Forwarded to :class:`~transformers.PretrainedConfig`.
        """
        # Core architecture settings
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.max_position_embeddings = max_position_embeddings
        self.moe_intermediate_size = moe_intermediate_size

        # Mixture of Experts & attention params
        self.n_group = n_group
        self.n_routed_experts = n_routed_experts
        self.n_shared_experts = n_shared_experts
        self.num_experts_per_tok = num_experts_per_tok
        self.first_k_dense_replace = first_k_dense_replace
        self.topk_group = topk_group
        self.topk_method = topk_method
        self.use_mla = use_mla

        # Dtype preference.
        # NOTE(review): if `torch_dtype` is also present in **kwargs,
        # PretrainedConfig.__init__ below will overwrite this attribute —
        # confirm callers pass it via the explicit parameter only.
        self.torch_dtype = torch_dtype

        # Subconfigs (for multimodal alignment)
        self.projector_config = projector_config
        self.vision_config = vision_config
        self.language_config = language_config

        # bos/eos are set by PretrainedConfig from these kwargs; no need to
        # also assign self.bos_token_id / self.eos_token_id manually here.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        logger.info("✅ Initialized DeepseekOCRConfig successfully.")