This repo contains code to run LLaMA with bi-directional (non-causal) attention by passing `is_causal=False` to the model's forward function.

```python
from modeling_llama import LlamaForCausalLM  # patched LLaMA that accepts is_causal
from transformers import AutoTokenizer
import torch

name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(name)
tokenizer.pad_token = tokenizer.bos_token  # LLaMA has no pad token by default

model = LlamaForCausalLM.from_pretrained(name, device_map='auto', torch_dtype=torch.bfloat16)

inp = tokenizer(['this is a test example', 'test'], return_tensors='pt', padding=True).to('cuda')

# Run the forward pass with bi-directional attention instead of the causal mask.
out = model(**inp, is_causal=False)
```
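
The forward call returns the usual `transformers` causal-LM output, so logits and hidden states can be inspected as normal. A minimal sketch (assuming the patched model keeps the standard `output_hidden_states` argument and output fields from upstream `transformers`):

```python
# Logits are produced for every position of the padded batch.
print(out.logits.shape)  # (batch_size, padded_seq_len, vocab_size)

# Hidden states computed with bi-directional attention can be requested
# the usual way (field names assumed from the upstream transformers API).
out = model(**inp, is_causal=False, output_hidden_states=True)
last_hidden = out.hidden_states[-1]  # (batch_size, padded_seq_len, hidden_size)
```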