# configs.py
import os
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional

import transformers

from .constant import DATASETS_DIR, RUNS_DIR

here = Path(__file__).parent
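# NOTE: `here` points at this package's directory; within this file it is only
# referenced by the commented-out FSDP config path at the bottom.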


@dataclass
class BaseArgs:
    ## Initialize
    model_path: str = field(default="xiwu/xiwu-13b-20230509", metadata={"help": "The path to the weights. This can be a local folder or a Hugging Face repo ID."})
    model_name: Optional[str] = field(default=None, metadata={"help": "The model name to include in the output; if None, the model path is used."})
    device: str = field(default="cuda", metadata={"help": "The device type", "choices": ["cpu", "cuda", "mps", "xpu", "npu"]})
    gpus: Optional[str] = field(default=None, metadata={"help": "A single GPU like 1 or multiple GPUs like 0,2"})
    num_gpus: int = field(default=1, metadata={"help": "Number of GPUs to use"})
    max_gpu_memory: Optional[str] = field(default=None, metadata={"help": "The maximum memory per GPU for storing model weights. Use a string like '13GiB'"})
    load_8bit: bool = field(default=False, metadata={"help": "Use 8-bit quantization"})
    dtype: Optional[str] = field(default=None, metadata={"help": "Override the default dtype. If not set, float16 is used on GPU and float32 on CPU.", "choices": ["float32", "float16", "bfloat16"]})
    lazy_loading: bool = field(default=True, metadata={"help": "Load the model and tokenizer lazily"})
    revision: str = field(default="main", metadata={"help": "Hugging Face Hub model revision identifier"})
    ## Run-time
    cpu_offloading: bool = field(default=False, metadata={"help": "Only used with 8-bit quantization: offload weights that do not fit on the GPU to the CPU"})
    conv_template: Optional[str] = field(default=None, metadata={"help": "Conversation prompt template."})
    conv_system_msg: Optional[str] = field(default=None, metadata={"help": "Conversation system message."})
    no_history: bool = field(default=False, metadata={"help": "Do not keep conversation history."})
    # Generation params
    temperature: float = field(default=0.7, metadata={"help": "Temperature for text generation."})
    top_p: float = field(default=1.0, metadata={"help": "Top-p for text generation."})
    top_k: int = field(default=-1, metadata={"help": "Top-k for text generation."})
    repetition_penalty: float = field(default=1.0, metadata={"help": "Repetition penalty for text generation."})
    max_new_tokens: int = field(default=512, metadata={"help": "Maximum number of new tokens to generate."})
    logprobs: Optional[int] = field(default=None, metadata={"help": "Include log probabilities in the output."})
    echo: bool = field(default=True, metadata={"help": "Echo the input text."})
    stop_str: Optional[str] = field(default=None, metadata={"help": "Stop generation at this string; can be a single string or a list of strings."})
    # stop_token_ids: Optional[list] = field(default=None, metadata={"help": "Stop generation at these token IDs; the default comes from the tokenizer"})
    judge_sent_end: bool = field(default=False, metadata={"help": "Whether to enable the correction logic for sentence output interrupted by EOS."})
    stream: bool = field(default=False, metadata={"help": "Enable streaming output."})
    oai_format: bool = field(default=True, metadata={"help": "Enable OpenAI-compatible format for streaming output."})
    # For CLI
    style: str = field(default="simple", metadata={"help": "Display style.", "choices": ["simple", "rich", "programmatic"]})
    multiline: bool = field(default=False, metadata={"help": "Enable multiline input. Use ESC+Enter for a newline."})
    mouse: bool = field(default=False, metadata={"help": "[Rich style]: Enable mouse support for cursor positioning."})
    debug: bool = field(default=False, metadata={"help": "Print useful debug information (e.g., prompts)"})
    # Quantization
    gptq_ckpt: Optional[str] = field(default=None, metadata={"help": "Used for GPTQ. The path to the local GPTQ checkpoint."})
    gptq_wbits: int = field(default=16, metadata={"help": "Used for GPTQ. Number of bits to use for quantization", "choices": [2, 3, 4, 8, 16]})
    gptq_groupsize: int = field(default=-1, metadata={"help": "Used for GPTQ. Group size to use for quantization; the default uses the full row."})
    gptq_act_order: bool = field(default=False, metadata={"help": "Used for GPTQ. Whether to apply the activation-order GPTQ heuristic"})
    awq_ckpt: Optional[str] = field(default=None, metadata={"help": "Used for AWQ. The path to the local AWQ checkpoint for loading a quantized model."})
    awq_wbits: int = field(default=16, metadata={"help": "Used for AWQ. Number of bits to use for AWQ quantization", "choices": [4, 16]})
    awq_groupsize: int = field(default=-1, metadata={"help": "Used for AWQ. Group size to use for AWQ quantization; the default uses the full row."})
    enable_exllama: bool = field(default=False, metadata={"help": "Used for ExLlamaV2. Enable the ExLlamaV2 inference framework."})
    exllama_max_seq_len: int = field(default=4096, metadata={"help": "Used for ExLlamaV2. Max sequence length for the ExLlamaV2 framework; default is 4096."})
    exllama_gpu_split: Optional[str] = field(default=None, metadata={"help": "Used for ExLlamaV2. Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7"})
    exllama_cache_8bit: bool = field(default=False, metadata={"help": "Used for ExLlamaV2. Use an 8-bit cache to save VRAM."})
    enable_xft: bool = field(default=False, metadata={"help": "Used for xFasterTransformer. Enable the xFasterTransformer inference framework."})
    xft_max_seq_len: int = field(default=4096, metadata={"help": "Used for xFasterTransformer. Max sequence length for the xFasterTransformer framework; default is 4096."})
    xft_dtype: Optional[str] = field(default=None, metadata={"help": "Override the default dtype. If not set, bfloat16 is used for the first token and float16 for subsequent tokens on CPU.", "choices": ["fp16", "bf16", "int8", "bf16_fp16", "bf16_int8"]})
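
# Usage sketch (an assumption about intended use, not something this module runs):
# BaseArgs can be populated from the command line via transformers.HfArgumentParser,
# and `gpus` would typically be exported before any CUDA initialization, e.g.
#
#     parser = transformers.HfArgumentParser(BaseArgs)
#     (base_args,) = parser.parse_args_into_dataclasses()
#     if base_args.gpus:  # e.g. --gpus 0,2
#         os.environ["CUDA_VISIBLE_DEVICES"] = base_args.gpus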


@dataclass
class ModelArgs:
    model_name_or_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5-16k")
    trust_remote_code: bool = field(default=False, metadata={"help": "Whether or not to allow for custom models defined on the Hub in their own modeling files"})
    padding_side: str = field(default="right", metadata={"help": "The padding side in the tokenizer"})


@dataclass
class DataArgs:
    data_path: str = field(default=f"{DATASETS_DIR}/hep_text_v1.0", metadata={"help": "Path to the training data."})
    eval_data_path: Optional[str] = field(default=None, metadata={"help": "Path to the evaluation data."})
    lazy_preprocess: bool = True


@dataclass
class TrainingArgs(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(default=576, metadata={"help": "Maximum sequence length. Sequences will be right-padded (and possibly truncated)."})
    output_dir: str = field(default=RUNS_DIR, metadata={"help": "The output directory where the model predictions and checkpoints will be written."})
    # fsdp: str = "full_shard auto_wrap offload"
    # fsdp_config: str = f"{here}/fsdp_config.json"
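

if __name__ == "__main__":
    # Minimal sketch, assuming the standard Hugging Face pattern for consuming
    # these dataclasses in a training entry point (this file defines no entry
    # point itself): HfArgumentParser fills each dataclass from its CLI flags.
    parser = transformers.HfArgumentParser((ModelArgs, DataArgs, TrainingArgs))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print(f"model:  {model_args.model_name_or_path}")
    print(f"data:   {data_args.data_path}")
    print(f"output: {training_args.output_dir}")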