In [2]:
# Sanity check: confirm that PyTorch can see a CUDA device before doing anything else.
import torch
print(torch.cuda.is_available())

True


In [3]:
# Install the libraries used by this notebook into the kernel's environment.
# NOTE(review): versions are not pinned here — TODO pin them once the working set is known,
# so that a fresh install reproduces this run.
%pip install torch transformers peft bitsandbytes trl datasets accelerate jsonlines

Note: you may need to restart the kernel to use updated packages.


## 라이브러리 설정

In [4]:
import torch  # needed for torch.bfloat16 in the quantization config below

# Dataset loading
from datasets import load_dataset
# Base model, tokenizer and 4-bit quantization config
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
# LoRA adapter configuration and k-bit training preparation
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    TaskType,
)
# Supervised fine-tuning trainer and its config
from trl import SFTTrainer, SFTConfig

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
pwd

'/home/codeit01team'

# 1. 기본 QLoRA 설정 (HuggingFace PEFT + BitsAndBytes)

In [6]:
# 1. 4-bit quantization settings used when loading the base model
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for compute
    bnb_4bit_use_double_quant=True,         # double quantization enabled
    bnb_4bit_quant_type="nf4",              # NF4 quantization type
    load_in_4bit=True,                      # load weights in 4-bit
)

In [7]:
# 2. Load the base model with the 4-bit quantization config
model_name = "beomi/Llama-3-Open-Ko-8B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,  # 4-bit settings defined in the previous cell
    device_map="auto",
    # trust_remote_code is deliberately NOT enabled: it lets Python code shipped in the
    # model repository run locally, and this checkpoint is a standard Llama architecture
    # that transformers supports natively, so the flag only adds risk.
)

Loading checkpoint shards: 100%|██████████| 6/6 [02:13<00:00, 22.28s/it]


In [8]:
# Load the tokenizer that matches the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pad on the right side of each sequence.
tokenizer.padding_side = "right"
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


In [9]:
# 3. Prepare the quantized model for k-bit training.
# Note: this rebinds `model`, so the cell should be run once per freshly loaded model.
model = prepare_model_for_kbit_training(model)

In [10]:
# 4. LoRA settings — only the attention projections are adapted, to save memory.
# gate_proj, up_proj and down_proj are intentionally left out; the original author's
# note says the performance difference is not large (not verified in this notebook).
lora_config = LoraConfig(
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

In [11]:
# 5. Apply the LoRA config to the prepared model.
# Note: this rebinds `model`; re-running the cell would apply get_peft_model to an
# already-wrapped model, so run it once per freshly prepared model.
model = get_peft_model(model, lora_config)
# Report how many parameters are trainable versus the total.
model.print_trainable_parameters()

trainable params: 13,631,488 || all params: 8,043,892,736 || trainable%: 0.1695


# 2. 데이터셋 준비 및 학습

`HuggingFace Dataset`을 사용해야 하는 이유

- SFTTrainer가 HuggingFace Dataset을 입력으로 받음
- 자동으로 batching, shuffling, tokenization 처리
- 메모리 효율적 (lazy loading)

In [12]:
# Load the SFT training set from a local JSONL file.
# Note: streaming is not enabled here (no streaming=True is passed to load_dataset).
data_path = "data/sft_train_llama.jsonl"
dataset = load_dataset("json", data_files=data_path, split="train")

In [13]:
# Training configuration, tuned for low memory use
sft_config = SFTConfig(
    # --- output / logging / checkpoints ---
    output_dir="./qlora_output",
    logging_steps=10,
    save_steps=500,                  # save by step count rather than per epoch
    # --- schedule ---
    num_train_epochs=2,
    learning_rate=2e-4,
    # --- batch: 1 sample x 16 accumulation steps = 16 samples per optimizer step per device ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    # --- memory / precision ---
    bf16=True,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    # --- data ---
    max_length=512,
    dataset_text_field="text",       # presumably each JSONL record has a "text" field — verify against the data file
)

In [14]:
# Build the SFT trainer from the LoRA-wrapped model, tokenizer, dataset and config
trainer = SFTTrainer(
    args=sft_config,
    model=model,
    processing_class=tokenizer,
    train_dataset=dataset,
)

In [15]:
# Start training. The call is the last expression in the cell, so its return value
# (the TrainOutput shown below) is displayed as the cell output.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128001, 'bos_token_id': 128000, 'pad_token_id': 128001}.


Step,Training Loss
10,2.5251
20,1.9333
30,1.8007
40,1.7757
50,1.7737
60,1.7195
70,1.7159
80,1.6813
90,1.671
100,1.6901


TrainOutput(global_step=1058, training_loss=1.4723652360119306, metrics={'train_runtime': 22771.7007, 'train_samples_per_second': 0.743, 'train_steps_per_second': 0.046, 'total_flos': 2.071327636721664e+17, 'train_loss': 1.4723652360119306, 'entropy': 1.4031210680802664, 'num_tokens': 4591590.0, 'mean_token_accuracy': 0.6802353163560232, 'epoch': 2.0})

In [16]:
# Save the model (LoRA weights only, per the original note) together with the tokenizer,
# both into the same directory.
adapter_dir = "./qlora_adapter"
trainer.model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

('./qlora_adapter/tokenizer_config.json',
 './qlora_adapter/special_tokens_map.json',
 './qlora_adapter/chat_template.jinja',
 './qlora_adapter/tokenizer.json')

In [None]:
# Optional (currently disabled): shuffle with a fixed seed and keep 1,000 examples,
# e.g. for a quick trial run. NOTE(review): to have any effect this would need to run
# before the trainer is built — confirm where it belongs or delete the cell.
# dataset = dataset.shuffle(seed=42).select(range(1000))