-
Notifications
You must be signed in to change notification settings - Fork 8
/
infer.py
46 lines (31 loc) · 1.08 KB
/
infer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
AutoTokenizer,
)
# Inference script: load the merged model with 4-bit quantization, sample one
# completion for a fixed prompt, and print only the assistant's reply.

device = "auto"  # forwarded to from_pretrained() as device_map
model_path = "outputs/merged_model"  # Path to the combined weights

# Prompt should be in this "#### Human: ...#### Assistant:" style due to how
# the data was created.
prompt = "#### Human: What is the capital of Australia?#### Assistant:"
# Marker separating the echoed prompt from the generated reply. It has no
# trailing space on purpose: the prompt itself ends right after the colon, so
# the model is not guaranteed to emit a space as its first character.
assistant_marker = "#### Assistant:"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map=device,
    quantization_config=bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Ask the tokenizer not to produce token_type_ids instead of deleting the key
# afterwards: an unconditional `del` raises KeyError for tokenizers that never
# emit that field. (Presumably generate() does not accept the extra key for
# this model -- confirm against the model's forward signature.)
inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False)
# Follow the model's actual placement rather than hard-coding 'cuda'.
inputs = inputs.to(model.device)

output = model.generate(
    **inputs,
    do_sample=True,
    top_p=0.95,
    top_k=60,
    max_new_tokens=100,
)
output = tokenizer.decode(output[0], skip_special_tokens=True)

# The decoded text starts with the prompt; keep what follows the first
# assistant marker. Fall back to the full text if the marker did not survive
# decoding, rather than failing with IndexError.
reply_parts = output.split(assistant_marker)
print((reply_parts[1] if len(reply_parts) > 1 else output).strip())