Fine-Tuning Large Models: Making AI Understand Your Business

Fine-tuning is the key technique for adapting a pretrained large model to a specific task. By continuing training on domain data, you can significantly improve the model's performance on specialized tasks while preserving its general capabilities.

Comparison of Fine-Tuning Methods

Method | Trainable params | GPU memory | Training speed | Quality | Best for
Full fine-tuning | 100% | Very high | — | Best | When resources are ample
LoRA | 0.1-1% | — | — | Very good | General-purpose default
QLoRA | 0.1-1% | Very low | — | Moderate | Resource-constrained setups
Prefix Tuning | 0.01% | Very low | Very fast | Moderate | Simple tasks
Adapter | 1-5% | — | — | Moderate | Multi-task settings
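
To make the "Trainable params" column concrete, here is a back-of-the-envelope count for LoRA on a Llama-2-7B-sized model (the dimensions below are assumptions based on that architecture):

# Rough LoRA trainable-parameter count for a Llama-2-7B-like model (assumed dims)
hidden_size = 4096       # model dimension
num_layers = 32          # transformer layers
num_target_modules = 4   # q_proj, k_proj, v_proj, o_proj
r = 16                   # LoRA rank

lora_params = num_layers * num_target_modules * 2 * r * hidden_size
print(f"LoRA params: {lora_params:,}")                      # ~16.8M
print(f"Share of ~6.7B total: {lora_params / 6.7e9:.2%}")   # ~0.25%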

Hands-On LoRA Fine-Tuning

The most popular parameter-efficient fine-tuning (PEFT) method.

# LoRA fine-tuning example
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Load the base model
model_name = "meta-llama/Llama-2-7b-hf"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Llama tokenizers have no pad token by default

# LoRA configuration
lora_config = LoraConfig(
    r=16,                      # LoRA rank
    lora_alpha=32,            # LoRA scaling factor
    target_modules=[          # modules to apply LoRA to
        "q_proj",
        "k_proj", 
        "v_proj",
        "o_proj"
    ],
    lora_dropout=0.1,         # dropout rate on the LoRA layers
    bias="none",              # do not train bias parameters
    task_type=TaskType.CAUSAL_LM
)

# Wrap the model with LoRA adapters
model = get_peft_model(model, lora_config)

# Print the number of trainable parameters
model.print_trainable_parameters()
# Example output (exact figures depend on r and target_modules):
# trainable params: ~16.8M || all params: ~6.7B || trainable%: ~0.25%

# Prepare the training data
def prepare_dataset(examples):
    # Build the instruction-following format
    texts = []
    for instruction, output in zip(examples['instruction'], examples['output']):
        text = f"""### Instruction:
{instruction}

### Response:
{output}"""
        texts.append(text)
    
    # Tokenize
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt'
    )
    
    # For causal LM training the labels are the input ids themselves
    # (optionally mask padding positions with -100)
    encodings['labels'] = encodings['input_ids'].clone()
    return encodings

# Training configuration
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./lora_model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=100,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="epoch"
)

# Start training
# (train_dataset / eval_dataset are tokenized datasets, e.g. produced by
#  dataset.map(prepare_dataset, batched=True); see the data preparation section below)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer
)

trainer.train()

# Save the LoRA adapter weights
model.save_pretrained("./lora_weights")

# Loading for inference
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, "./lora_weights")
model = model.merge_and_unload()  # merge LoRA weights into the base model
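
To sanity-check the merged model, a minimal generation sketch (the prompt below is only illustrative and should match the instruction format used during training):

# Generate from the merged model (illustrative prompt)
prompt = "### Instruction:\nExplain LoRA in one sentence.\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))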

QLoRA: 4-Bit Quantized Fine-Tuning

Fine-tune large models on consumer-grade GPUs.

# QLoRA fine-tuning - very low GPU memory requirements
from transformers import BitsAndBytesConfig
import torch

# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-13b-hf",  # 13B model
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# QLoRA configuration
qlora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

# Prepare the quantized model for training, then apply QLoRA
from peft import prepare_model_for_kbit_training

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, qlora_config)

# GPU memory comparison (rough figures)
# 13B model weights in FP16: ~26GB
# QLoRA 4-bit 13B: ~6GB (including gradients for the LoRA adapters)

# Optional: use DeepSpeed for further optimization
deepspeed_config = {
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": 2e-4,
            "betas": [0.9, 0.999],
            "eps": 1e-8
        }
    },
    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": 0,
            "warmup_max_lr": 2e-4,
            "warmup_num_steps": 100
        }
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "cpu",
            "pin_memory": true
        },
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8
    }
}
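
The dictionary above is only defined; one common way to wire it in (a sketch with illustrative values) is to pass it via the deepspeed argument of TrainingArguments, which accepts either a dict or a path to a JSON file:

# Hook the DeepSpeed config into the Trainer (sketch; values are illustrative)
training_args = TrainingArguments(
    output_dir="./qlora_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    learning_rate=2e-4,
    deepspeed=deepspeed_config   # dict or path to a JSON config file
)
# Training is then typically launched with the DeepSpeed launcher, e.g. `deepspeed train.py`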

Data Preparation Best Practices

High-quality data is the key to successful fine-tuning.

Data Format Example

[
  {
    "instruction": "将以下文本翻译成英文",
    "input": "机器学习是人工智能的一个分支",
    "output": "Machine learning is a branch of artificial intelligence"
  },
  {
    "instruction": "总结以下文本的要点",
    "input": "大语言模型通过在海量文本数据上预训练...",
    "output": "要点:1. 预训练方式 2. 数据规模 3. 应用能力"
  },
  {
    "instruction": "编写Python函数实现快速排序",
    "input": "",
    "output": "def quicksort(arr):\n    if len(arr) <= 1:..."
  }
]
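
The prepare_dataset function shown earlier ignores the optional input field; below is a small sketch of a prompt template that includes it (the template itself is an assumption, so keep it consistent with the prompts you use at inference time):

# Turn an instruction record into a training prompt; 'input' may be empty (sketch)
def build_prompt(item: dict) -> str:
    if item.get("input"):
        return (f"### Instruction:\n{item['instruction']}\n\n"
                f"### Input:\n{item['input']}\n\n"
                f"### Response:\n{item['output']}")
    return (f"### Instruction:\n{item['instruction']}\n\n"
            f"### Response:\n{item['output']}")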

Data Quality Requirements

  • Accuracy: make sure the labeled data is correct
  • Diversity: cover a wide range of scenarios and edge cases
  • Consistency: keep the format and style uniform
  • Representativeness: reflect real-world usage scenarios
  • Appropriate scale: typically 1,000-10,000 high-quality examples

Data Cleaning Pipeline

def clean_dataset(data):
    cleaned = []
    seen_texts = set()  # for deduplication
    for item in data:
        # Drop records with empty fields
        if not item['instruction'] or not item['output']:
            continue
        
        # Length filter
        if len(item['output']) < 10 or len(item['output']) > 2000:
            continue
        
        # Deduplicate
        text = item['instruction'] + item['output']
        if text in seen_texts:
            continue
        seen_texts.add(text)
        
        # Normalize formatting
        item['instruction'] = item['instruction'].strip()
        item['output'] = item['output'].strip()
        
        cleaned.append(item)
    
    return cleaned
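
To connect the cleaned data to the Trainer used earlier, a minimal sketch (assumes the datasets library and the prepare_dataset function defined above; the file name is illustrative):

# Build tokenized train/eval datasets from cleaned records (sketch)
import json
from datasets import Dataset

with open("train.json", encoding="utf-8") as f:   # illustrative file name
    records = clean_dataset(json.load(f))

splits = Dataset.from_list(records).train_test_split(test_size=0.1, seed=42)
train_dataset = splits["train"].map(prepare_dataset, batched=True,
                                    remove_columns=splits["train"].column_names)
eval_dataset = splits["test"].map(prepare_dataset, batched=True,
                                  remove_columns=splits["test"].column_names)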

Evaluating Fine-Tuning Results

How to assess the performance of a fine-tuned model.

Automatic Evaluation Metrics

  • Perplexity: language-model perplexity (a minimal computation sketch follows this list)
  • BLEU: scoring for translation tasks
  • ROUGE: scoring for summarization tasks
  • Accuracy: accuracy on classification tasks
  • F1 Score: combines precision and recall
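
A minimal perplexity sketch over a held-out list of texts (assumes the fine-tuned model and tokenizer from the sections above; weighting by total token count is a slight approximation because labels are shifted internally):

# Perplexity of a causal LM over held-out texts (sketch)
import math
import torch

def perplexity(model, tokenizer, texts):
    model.eval()
    total_nll, total_tokens = 0.0, 0
    with torch.no_grad():
        for text in texts:
            enc = tokenizer(text, return_tensors="pt").to(model.device)
            out = model(**enc, labels=enc["input_ids"])
            n = enc["input_ids"].numel()
            total_nll += out.loss.item() * n   # loss = mean NLL per predicted token
            total_tokens += n
    return math.exp(total_nll / total_tokens)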

Human Evaluation Dimensions

  • Relevance: does the answer address the question
  • Accuracy: is the information correct
  • Fluency: is the language natural
  • Completeness: is the answer thorough
  • Consistency: is the style uniform

A/B Testing Example

# Compare the base model against the fine-tuned model
# (generate() and calculate_score() are placeholders for your own generation and scoring logic)
import numpy as np

def compare_models(base_model, finetuned_model, test_cases):
    results = []
    
    for case in test_cases:
        base_output = generate(base_model, case['input'])
        ft_output = generate(finetuned_model, case['input'])
        
        # Automatic scoring
        base_score = calculate_score(base_output, case['expected'])
        ft_score = calculate_score(ft_output, case['expected'])
        
        results.append({
            'input': case['input'],
            'base_output': base_output,
            'ft_output': ft_output,
            'base_score': base_score,
            'ft_score': ft_score,
            'improvement': ft_score - base_score
        })
    
    # Aggregate statistics
    avg_improvement = np.mean([r['improvement'] for r in results])
    win_rate = sum(1 for r in results if r['ft_score'] > r['base_score']) / len(results)
    
    print(f"平均提升: {avg_improvement:.2%}")
    print(f"胜率: {win_rate:.2%}")

Deploying the Fine-Tuned Model as a Service

Putting the fine-tuned model into production.

# Serve the fine-tuned model with vLLM
from vllm import LLM, SamplingParams

# Load the merged model
llm = LLM(
    model="./merged_model",
    tensor_parallel_size=1,
    dtype="half",  # FP16推理
    max_model_len=2048
)

# API service
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class GenerationRequest(BaseModel):
    prompt: str
    max_tokens: int = 256
    temperature: float = 0.7
    top_p: float = 0.9

@app.post("/generate")
async def generate(request: GenerationRequest):
    sampling_params = SamplingParams(
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens
    )
    
    outputs = llm.generate([request.prompt], sampling_params)
    
    return {
        "generated_text": outputs[0].outputs[0].text,
        "usage": {
            "prompt_tokens": len(outputs[0].prompt_token_ids),
            "completion_tokens": len(outputs[0].outputs[0].token_ids)
        }
    }

# Start the service
# uvicorn main:app --host 0.0.0.0 --port 8000

# Docker deployment (Dockerfile kept as a string for illustration)
dockerfile = """
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04

RUN apt-get update && apt-get install -y python3-pip && \
    pip3 install vllm fastapi uvicorn

COPY ./merged_model /model
COPY ./main.py /app/main.py

WORKDIR /app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
"""

Fine-Tuning Cost Estimates

Approximate fine-tuning costs for models of different sizes.

Model size | GPUs required | Training time | Cloud cost
7B (LoRA) | 1 × A100 40GB | 2-4 hours | $10-20
13B (QLoRA) | 1 × A100 40GB | 4-8 hours | $20-40
30B (LoRA) | 2 × A100 80GB | 8-16 hours | $80-160
70B (QLoRA) | 4 × A100 80GB | 24-48 hours | $500-1000

* Estimates assume about 10,000 training examples and 3 epochs.
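
The cost column is roughly GPU count × training hours × hourly GPU price; a trivial helper where the hourly rate is something you supply, since it varies widely across providers:

# Rough cloud-cost estimate (the hourly rate is a user-supplied assumption)
def estimate_cost(num_gpus: int, hours: float, usd_per_gpu_hour: float) -> float:
    return num_gpus * hours * usd_per_gpu_hour

# e.g. 1 GPU for 4 hours at an assumed $3/GPU-hour ≈ $12
print(estimate_cost(1, 4, 3.0))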

Start Fine-Tuning Your Own Model

With fine-tuning, you can make a large model understand and handle your domain-specific tasks far better. Combined with the LLM API, you can quickly deploy and put your fine-tuned model to use.

Start fine-tuning now