-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbatch_lora_codebook_gpu_multi.sh
More file actions
136 lines (124 loc) · 4.34 KB
/
Copy pathbatch_lora_codebook_gpu_multi.sh
File metadata and controls
136 lines (124 loc) · 4.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env bash
#
# Batch driver: runs LoRA fine-tuning followed by evaluation for one dataset
# over one or more codebook directories.
#
# Usage:
#   scripts/batch_lora_codebook_gpu_multi.sh DATASET CODEBOOK_DIR [CODEBOOK_DIR ...]
#
#   DATASET       WN18RR or FB15K-237
#   CODEBOOK_DIR  one or more codebook directories to process
set -euo pipefail

# DATASET plus at least one CODEBOOK_DIR are required. Usage and validation
# errors are diagnostics, so they are written to stderr.
if [[ $# -lt 2 ]]; then
  cat >&2 <<'USAGE'
Usage: scripts/batch_lora_codebook_gpu_multi.sh DATASET CODEBOOK_DIR [CODEBOOK_DIR ...]
DATASET must be one of: WN18RR, FB15K-237
USAGE
  exit 1
fi

DATASET="$1"
case "$DATASET" in
  "WN18RR" | "FB15K-237") ;;
  *)
    echo "Invalid dataset: ${DATASET}. Use WN18RR or FB15K-237." >&2
    exit 1
    ;;
esac

# Drop DATASET so that "$@" holds only the codebook directories.
shift
# GPUs exposed to every child process started by this script.
CUDA_VISIBLE_DEVICES="0,1,2,3"
export CUDA_VISIBLE_DEVICES

# ------------------------------------------
# Base model checkpoints to sweep over.
# Uncomment a line to include that model.
# ------------------------------------------
BASE_MODELS=()
# BASE_MODELS+=("/nvme1n1/LLM/Mistral-7B-Instruct-v0.3")
# BASE_MODELS+=("/nvme1n1/LLM/deepseek-moe-16b-base")
BASE_MODELS+=("/nvme1n1/LLM/Qwen2.5-14B-Instruct")
BASE_MODELS+=("/nvme1n1/LLM/llama-2-7b-chat-hf")
BASE_MODELS+=("/nvme1n1/LLM/llama-2-13b-chat")
# BASE_MODELS+=("/nvme1n1/LLM/vicuna7b")
BASE_MODELS+=("/nvme1n1/LLM/vicuna-13b")

# ------------------------------------------
# LoRA settings, one entry per run, encoded as
#   prefix|target_modules|lora_r|lora_alpha
# ------------------------------------------
LORA_CONFIGS=()
LORA_CONFIGS+=("sens|q_proj,v_proj|64|32")
# LORA_CONFIGS+=("v5|q_proj,v_proj|64|64")
# LORA_CONFIGS+=("v5|q_proj,v_proj|128|64")
# LORA_CONFIGS+=("v4|q_proj,v_proj|64|16")
# LORA_CONFIGS+=("v4|q_proj,v_proj|128|64")

# The positional arguments at this point are the codebook directories.
declare -a CODEBOOK_DIRS=("$@")
# ==========================================
# Sweep: base model x codebook dir x LoRA config
# ==========================================
# For each combination, train_lora.py is launched through torchrun and then
# eval_llm.py is run against the training summary file. Every run writes into
# its own timestamped directory under
# processed_data/<DATASET>/checkpoints/LoRA_FT/. Missing base models or
# codebook inputs are skipped with a notice; a failing training or evaluation
# step aborts the batch with that step's exit status.

#######################################
# Check that a codebook directory holds the inputs a run needs.
# Arguments: $1 - codebook directory (trailing slash already stripped)
# Outputs:   a ">>> [SKIP]" notice on stdout naming what is missing
# Returns:   0 if the directory, tokens.json and train.jsonl all exist;
#            1 otherwise
#######################################
codebook_ready() {
  local dir=$1
  local required
  if [[ ! -d "$dir" ]]; then
    echo ">>> [SKIP] Codebook dir missing: ${dir}"
    return 1
  fi
  for required in tokens.json train.jsonl; do
    if [[ ! -f "${dir}/${required}" ]]; then
      echo ">>> [SKIP] Codebook file missing: ${dir}/${required}"
      return 1
    fi
  done
  return 0
}

# Number of train+eval runs that finished; drives the final status line.
RUNS_COMPLETED=0

for BASE_MODEL in "${BASE_MODELS[@]}"; do
  BASE_NAME="$(basename "$BASE_MODEL")"
  if [[ ! -d "$BASE_MODEL" ]]; then
    echo ">>> [SKIP] Base model missing: ${BASE_MODEL}"
    continue
  fi
  echo "############################################################"
  echo ">>> Base model: ${BASE_NAME}"
  echo "############################################################"

  for CODEBOOK_DIR in "${CODEBOOK_DIRS[@]}"; do
    # Normalise "dir/" to "dir" so basename and path joins behave.
    CODEBOOK_DIR="${CODEBOOK_DIR%/}"
    if ! codebook_ready "$CODEBOOK_DIR"; then
      continue
    fi
    TOKENS_FILE="${CODEBOOK_DIR}/tokens.json"
    TRAIN_FILE="${CODEBOOK_DIR}/train.jsonl"
    CODEBOOK_NAME="$(basename "$CODEBOOK_DIR")"

    for CONFIG_STR in "${LORA_CONFIGS[@]}"; do
      # Each entry is "prefix|target_modules|lora_r|lora_alpha".
      IFS="|" read -r PREFIX T_MODULES L_R L_ALPHA <<< "$CONFIG_STR"

      # Minute-resolution timestamp keeps repeated sweeps in separate dirs.
      TIMESTAMP=$(date +%Y%m%d_%H%M)
      RUN_TAG="${BASE_NAME}_${CODEBOOK_NAME}_${PREFIX}_${TIMESTAMP}"
      OUTPUT_DIR="processed_data/${DATASET}/checkpoints/LoRA_FT/${BASE_NAME}/${PREFIX}/${RUN_TAG}"
      mkdir -p "$OUTPUT_DIR"

      SUMMARY_FILE="${OUTPUT_DIR}/train_summary.json"
      TRAIN_LOG="${OUTPUT_DIR}/train.log"
      EVAL_LOG="${OUTPUT_DIR}/eval.log"
      # Random rendezvous port in [10000, 29999] for torchrun.
      MASTER_PORT=$((10000 + RANDOM % 20000))

      echo "------------------------------------------------------------"
      echo ">>> Task: ${RUN_TAG}"
      echo ">>> Base: ${BASE_MODEL}"
      echo ">>> Config: Modules=[${T_MODULES}], R=${L_R}, Alpha=${L_ALPHA}"
      echo ">>> Output: ${OUTPUT_DIR}"
      echo "------------------------------------------------------------"

      # --nproc_per_node=4 starts four workers; keep it equal to the number of
      # GPUs listed in CUDA_VISIBLE_DEVICES. nohup makes the training process
      # ignore SIGHUP. All training output goes to TRAIN_LOG, so the exit
      # status is captured and reported explicitly instead of letting
      # `set -e` abort without any message on the console.
      # NOTE(review): --lr_scheduler_type constant is combined with
      # --warmup_ratio 0.03; if train_lora.py forwards these to Hugging Face
      # TrainingArguments, the plain `constant` schedule applies no warmup —
      # confirm whether `constant_with_warmup` was intended.
      train_status=0
      nohup uv run torchrun --nproc_per_node=4 --master_port="$MASTER_PORT" train_lora.py \
        --model_name_or_path "$BASE_MODEL" \
        --tokens_file "$TOKENS_FILE" \
        --train_file "$TRAIN_FILE" \
        --text_column instruction \
        --output_dir "$OUTPUT_DIR" \
        --train_summary_file "$SUMMARY_FILE" \
        --overwrite_output_dir True \
        --per_device_train_batch_size 8 \
        --gradient_accumulation_steps 1 \
        --learning_rate 2e-4 \
        --trust_remote_code True \
        --source_max_len 2048 \
        --target_max_len 64 \
        --num_train_epochs 4.0 \
        --warmup_ratio 0.03 \
        --lr_scheduler_type constant \
        --logging_steps 200 \
        --save_steps 200 \
        --do_sample True \
        --save_total_limit 1 \
        --logging_dir "$OUTPUT_DIR/logs" \
        --bf16 True \
        --lora_dropout 0.1 \
        --deepspeed configs/ds_config_zero3.json \
        --optim paged_adamw_32bit \
        --target_modules "$T_MODULES" \
        --lora_r "$L_R" \
        --lora_alpha "$L_ALPHA" \
        >"$TRAIN_LOG" 2>&1 || train_status=$?
      if ((train_status != 0)); then
        echo ">>> [ERROR] Training failed (exit ${train_status}); see ${TRAIN_LOG}" >&2
        exit "$train_status"
      fi

      echo ">>> Training finished; starting evaluation..."
      eval_status=0
      uv run eval_llm.py \
        --summary_config_path "$SUMMARY_FILE" \
        --data_path data \
        --batch_size 64 \
        --max_new_tokens 64 \
        --min_new_tokens 1 \
        --source_max_len 2048 \
        --target_max_len 64 \
        --do_sample False \
        --num_beams 1 \
        --num_return_sequences 1 \
        --model_name_or_path "$BASE_MODEL" \
        >"$EVAL_LOG" 2>&1 || eval_status=$?
      if ((eval_status != 0)); then
        echo ">>> [ERROR] Evaluation failed (exit ${eval_status}); see ${EVAL_LOG}" >&2
        exit "$eval_status"
      fi

      RUNS_COMPLETED=$((RUNS_COMPLETED + 1))
      echo ">>> Evaluation completed ✅"
      echo ""
    done
  done
done

# Only claim success when at least one run actually executed.
if ((RUNS_COMPLETED > 0)); then
  echo ">>> Evaluation all completed ✅"
else
  echo ">>> [WARN] No runs were executed: every base model or codebook was skipped." >&2
fi