# train_vlm.py
import os
import torch
from transformers import (
    TrainingArguments, Trainer, DefaultDataCollator,
    AutoTokenizer, AutoImageProcessor
)
from datasets import load_dataset
from PIL import Image
from custom_vlm import CustomScratchVLM, VLMConfig


def get_processors_and_model(config):
    vision_model_name = config.vision_config._name_or_path
    language_model_name = config.language_config._name_or_path
    image_processor = AutoImageProcessor.from_pretrained(vision_model_name)
    tokenizer = AutoTokenizer.from_pretrained(language_model_name)

    # Register the <IMAGE> placeholder so the text stream can mark where the
    # vision features are spliced in.
    IMAGE_TOKEN = "<IMAGE>"
    tokenizer.add_special_tokens({"additional_special_tokens": [IMAGE_TOKEN]})
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # The model is built after the tokenizer is extended so its embedding table
    # already covers the new special token.
    config.language_config.vocab_size = len(tokenizer)
    model = CustomScratchVLM(config)
    model.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
    return image_processor, tokenizer, model
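

# Hedged, optional sanity check of the wiring above; it is not part of training.
# VLMConfig and CustomScratchVLM come from custom_vlm.py, which is not shown here,
# so this is only a sketch of how the helper is expected to behave.
def check_image_token_wiring():
    config = VLMConfig()
    _, tokenizer, model = get_processors_and_model(config)
    image_id = tokenizer.convert_tokens_to_ids("<IMAGE>")
    assert model.image_token_id == image_id, "model and tokenizer disagree on <IMAGE>"
    # get_processors_and_model sets the config vocab size to the extended tokenizer length.
    assert config.language_config.vocab_size == len(tokenizer)
    print(f"<IMAGE> token id: {image_id}, extended vocab size: {len(tokenizer)}")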


def load_and_prepare_dataset(stage, image_processor, tokenizer, model, split="train[:200]"):
    print("Loading dataset 'zera09/lmarena-ai_VisionArena-Chat-en'...")
    dataset = load_dataset("zera09/lmarena-ai_VisionArena-Chat-en", split=split)
    print("Dataset loaded successfully.")

    IMAGE_TOKEN = "<IMAGE>"
    TEXT_MAX_LENGTH = 256
    # Number of vision patches the model splices in place of the single <IMAGE>
    # token (e.g. a 224 px image with 16 px patches gives 14 * 14 = 196 patches).
    NUM_IMAGE_PATCHES = (image_processor.size['height'] // image_processor.patch_size) ** 2
    # Final sequence length once the one-token placeholder is replaced by the patches.
    FINAL_MAX_LENGTH = TEXT_MAX_LENGTH - 1 + NUM_IMAGE_PATCHES
    def preprocess_function(examples):
        image = examples['image'].convert("RGB")

        # The 'conversation' field is a list of turns; each turn is itself a
        # one-element list holding a {'role', 'content'} dict.
        conversation = examples['conversation']
        full_text = ""
        is_first_user_turn = True
        for turn_list in conversation:
            if not turn_list:
                continue
            turn = turn_list[0]
            role = turn['role'].upper()
            content = turn['content']
            if role == "USER" and is_first_user_turn:
                # Prepend the image placeholder to the first user turn only.
                full_text += f"USER: {IMAGE_TOKEN}\n{content}\n"
                is_first_user_turn = False
            else:
                full_text += f"{role}: {content}\n"
        full_text += tokenizer.eos_token

        tokenized = tokenizer(full_text, max_length=TEXT_MAX_LENGTH, truncation=True)
        input_ids = torch.tensor(tokenized.input_ids)
        try:
            image_token_idx = torch.where(input_ids == model.image_token_id)[0][0].item()
        except IndexError:
            # No <IMAGE> token in the tokenized text (e.g. the conversation had no
            # user turn). A non-batched Dataset.map call must return a dict with a
            # consistent schema, so return a dummy row flagged invalid and drop it
            # after mapping instead of returning None.
            side = image_processor.size['height']  # square input assumed, as in NUM_IMAGE_PATCHES
            return {
                "pixel_values": torch.zeros(3, side, side),
                "input_ids": torch.zeros(TEXT_MAX_LENGTH, dtype=torch.long),
                "attention_mask": torch.zeros(FINAL_MAX_LENGTH, dtype=torch.long),
                "labels": torch.full((FINAL_MAX_LENGTH,), -100, dtype=torch.long),
                "valid": False,
            }
        labels = input_ids.clone()

        # Supervise only the assistant turns: everything outside an "ASSISTANT:" span
        # (up to the next "USER:" marker) is set to -100. Note: this assumes the markers
        # tokenize to the same ids standalone as they do inside the full prompt, which
        # holds for most tokenizers.
        assistant_marker = torch.tensor(tokenizer("ASSISTANT:", add_special_tokens=False).input_ids)
        user_marker = torch.tensor(tokenizer("USER:", add_special_tokens=False).input_ids)
        is_assistant_section = torch.zeros_like(labels, dtype=torch.bool)
        for i in range(len(labels) - len(assistant_marker) + 1):
            if (labels[i:i + len(assistant_marker)] == assistant_marker).all():
                end_idx = len(labels)
                for j in range(i + 1, len(labels) - len(user_marker) + 1):
                    if (labels[j:j + len(user_marker)] == user_marker).all():
                        end_idx = j
                        break
                is_assistant_section[i:end_idx] = True
        labels[~is_assistant_section] = -100

        # Expand the single <IMAGE> position into NUM_IMAGE_PATCHES positions
        # (all ignored by the loss), then pad to the fixed final length.
        pre_labels = labels[:image_token_idx]
        post_labels = labels[image_token_idx + 1:]
        image_labels_pad = torch.full((NUM_IMAGE_PATCHES,), -100, dtype=torch.long)
        final_labels = torch.cat([pre_labels, image_labels_pad, post_labels], dim=0)
        final_labels = torch.nn.functional.pad(
            final_labels, (0, FINAL_MAX_LENGTH - len(final_labels)), value=-100
        )[:FINAL_MAX_LENGTH]

        # Same expansion for the attention mask: real text and image patches get 1,
        # trailing padding gets 0.
        attention_mask = torch.ones_like(input_ids)
        pre_mask = attention_mask[:image_token_idx]
        post_mask = attention_mask[image_token_idx + 1:]
        image_mask = torch.ones(NUM_IMAGE_PATCHES, dtype=torch.long)
        final_attention_mask = torch.cat([pre_mask, image_mask, post_mask], dim=0)
        final_attention_mask = torch.nn.functional.pad(
            final_attention_mask, (0, FINAL_MAX_LENGTH - len(final_attention_mask)), value=0
        )[:FINAL_MAX_LENGTH]

        # Pad input_ids to the fixed text length so that, once the model swaps the
        # placeholder for NUM_IMAGE_PATCHES vision embeddings, the sequence length
        # lines up with the FINAL_MAX_LENGTH labels and attention mask built above.
        final_input_ids = torch.nn.functional.pad(
            input_ids, (0, TEXT_MAX_LENGTH - len(input_ids)), value=tokenizer.pad_token_id
        )

        pixel_values = image_processor(image, return_tensors="pt").pixel_values
        return {
            "pixel_values": pixel_values.squeeze(0),
            "input_ids": final_input_ids,
            "attention_mask": final_attention_mask,
            "labels": final_labels,
            "valid": True,
        }
    processed_dataset = dataset.map(preprocess_function, remove_columns=list(dataset.column_names))
    # Drop the rows that preprocess_function flagged as invalid.
    processed_dataset = processed_dataset.filter(lambda x: x["valid"])
    return processed_dataset.remove_columns("valid")
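

# Hedged debugging helper, not called during training: prints the outer length of
# each field of one preprocessed example so the fixed lengths above (TEXT_MAX_LENGTH
# and FINAL_MAX_LENGTH) can be checked quickly. The argument is whatever
# load_and_prepare_dataset returned.
def inspect_first_example(processed_dataset):
    example = processed_dataset[0]
    for key, value in example.items():
        size = len(value) if isinstance(value, (list, tuple)) else value
        print(f"{key}: {size}")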


def train_vlm_stage(stage, output_dir, resume_from=None):
    print(f"🚀 Starting VLM Conversational Training Stage {stage} FROM SCRATCH...")
    device = "cuda" if torch.cuda.is_available() else "cpu"

    vlm_config = VLMConfig()
    image_processor, tokenizer, model = get_processors_and_model(vlm_config)
    model.to(device)

    # Each stage trains on its own 200-example slice of the dataset.
    split = f"train[{200 * (stage - 1)}:{200 * stage}]"
    tokenized_dataset = load_and_prepare_dataset(stage, image_processor, tokenizer, model, split=split)

    # fp16 and bf16 are mutually exclusive in TrainingArguments, so prefer bf16
    # where the GPU supports it and fall back to fp16 otherwise.
    use_bf16 = device == "cuda" and torch.cuda.is_bf16_supported()
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        num_train_epochs=3,
        learning_rate=5e-5,
        fp16=(device == "cuda" and not use_bf16),
        bf16=use_bf16,
        save_strategy="epoch",
        logging_steps=5,
        report_to="none",
        optim="adamw_torch",
        remove_unused_columns=False,
    )

    trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset, data_collator=DefaultDataCollator())
    trainer.train(resume_from_checkpoint=resume_from)

    model.save_pretrained(output_dir)
    image_processor.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"✅ Stage {stage} model and processors saved to {output_dir}")