optional-skills/mlops/llava/references/training.md
Guide to training and fine-tuning LLaVA models.
Purpose: Align vision encoder with language model
Data: 558K image-caption pairs (CC3M subset)
# Download pretrained projector or train from scratch
bash scripts/v1_5/pretrain.sh
Configuration:
Purpose: Teach model to follow visual instructions
Data: 150K GPT-generated multimodal instruction data
# Fine-tune with instruction data
bash scripts/v1_5/finetune.sh
Configuration:
[
{
"id": "001",
"image": "path/to/image.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nWhat is in this image?"
},
{
"from": "gpt",
"value": "The image shows a dog playing in a park."
},
{
"from": "human",
"value": "What breed is the dog?"
},
{
"from": "gpt",
"value": "It appears to be a Golden Retriever."
}
]
}
]
import json
# Create instruction data
data = []
for image_path, qa_pairs in your_dataset:
conversations = []
for q, a in qa_pairs:
conversations.append({"from": "human", "value": f"<image>\n{q}"})
conversations.append({"from": "gpt", "value": a})
data.append({
"id": str(len(data)),
"image": image_path,
"conversations": conversations
})
# Save
with open("custom_data.json", "w") as f:
json.dump(data, f, indent=2)
#!/bin/bash
# Set paths
DATA_PATH="custom_data.json"
IMAGE_FOLDER="path/to/images"
MODEL_PATH="liuhaotian/llava-v1.5-7b"
OUTPUT_DIR="./checkpoints/llava-custom"
# Fine-tune
deepspeed llava/train/train_mem.py \
--deepspeed ./scripts/zero2.json \
--model_name_or_path $MODEL_PATH \
--version v1 \
--data_path $DATA_PATH \
--image_folder $IMAGE_FOLDER \
--vision_tower openai/clip-vit-large-patch14-336 \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--image_aspect_ratio pad \
--group_by_modality_length True \
--bf16 True \
--output_dir $OUTPUT_DIR \
--num_train_epochs 1 \
--per_device_train_batch_size 16 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 50000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--weight_decay 0. \
--warmup_ratio 0.03 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--tf32 True \
--model_max_length 2048 \
--gradient_checkpointing True \
--dataloader_num_workers 4 \
--lazy_preprocess True \
--report_to wandb
from peft import LoraConfig, get_peft_model
# LoRA config
lora_config = LoraConfig(
r=8, # LoRA rank
lora_alpha=16,
target_modules=["q_proj", "v_proj"],
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM"
)
# Apply LoRA
model = get_peft_model(base_model, lora_config)
# Train with much lower memory