
Commit

update
Ricardokevins committed Mar 28, 2023
1 parent d111e5f commit 04341d9
Showing 10 changed files with 648 additions and 9 deletions.
Binary file modified .DS_Store
65 changes: 65 additions & 0 deletions LLM/inference.py
@@ -0,0 +1,65 @@
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer
from transformers import GPTJForCausalLM
import json
import torch
import time
import os
from tqdm import tqdm
from config import LLM_Config
from utils import load_model_tokenizer, return_log_writer
# CUDA_VISIBLE_DEVICES=3,5 deepspeed --num_gpus=2 run_LLM.py --do_eval --deepspeed ds_config.json
# deepspeed --include="localhost:3,5" run_LLM.py --do_eval --deepspeed ds_config.json
# deepspeed --include="localhost:0" run_LLM.py --do_eval --deepspeed ds_config.json

name = input("Model name: ")
if name not in LLM_Config:
    raise SystemExit("Unknown model name: " + name)

model, tokenizer = load_model_tokenizer(name)
logger = return_log_writer(name)

while True:
    input_text = input("Prompt: ")
    if not input_text.strip():
        continue
    if input_text[0] == "+":  # special prefix: decode every example in <name>.json
        with open(input_text[1:] + '.json', 'r') as f:
            data = json.load(f)
        for i in tqdm(data):
            text = i['input']
            inputs = tokenizer.encode(text, return_tensors="pt", max_length=2048, truncation=True)
            inputs = inputs.cuda()
            # allow up to 150 generated tokens beyond the prompt length
            decode_length_constrain = list(inputs.shape)[1] + 150
            with torch.no_grad():
                outputs = model.generate(inputs, max_length=decode_length_constrain)
            generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
            i['target'] = generated

        with open(input_text[1:] + '_decoded_' + name + '.json', 'w') as f:
            json.dump(data, f, indent=4)
    else:
        # single interactive prompt
        inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)
        inputs = inputs.cuda()
        with torch.no_grad():
            outputs = model.generate(inputs, max_length=1024)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(name + ": ", generated)
        data = {'input': input_text, 'generated': generated}
        logger.write(json.dumps(data, indent=4) + '\n')
        logger.flush()
    print("================================================================")


75 changes: 66 additions & 9 deletions QuerySearch/DialogueSearch.py
@@ -112,15 +112,22 @@ def get_documents_score(self, query):
        # print(" ".join(i))
        return score_list

-path = "/Users/sheshuaijie/Desktop/workspace/Data/Data/SAMSum/train.json"
+path = "/Users/sheshuaijie/Desktop/RearchSpace/Data/Data/SAMSum/train.json"
import json
f = open(path, 'r')
data = json.load(f)
f.close()

import string
punctuation_string = string.punctuation
def re_punctuation(stri):
    # replace every punctuation character with a space
    for i in punctuation_string:
        stri = stri.replace(i, ' ')
    return stri

def return_biturn(turns):
    # pair each turn with the following turn (two-turn windows)
    biturns = []
-    for i in range(0, len(turns) - 1, 2):
+    for i in range(0, len(turns) - 1):
        biturns.append(turns[i] + turns[i + 1])
    return biturns

@@ -129,7 +136,12 @@ def return_biturn(turns):
    dialogue = data[i]['dialogue']
    turns = dialogue.split('\n')
    # drop the "Speaker: " prefix from each turn
    turns = [i.strip()[len(i.split(":")[0]) + 2:] for i in turns]
    turns = [re_punctuation(i) for i in turns]

    turns = [i.split(" ") for i in turns]
    turns = [[j for j in i if len(j) != 0] for i in turns]
    # print(turns)
    # exit()
    all_tokens = []
    biturn = return_biturn(turns)
    Document_Features.extend(biturn)
@@ -143,7 +155,7 @@ def return_biturn(turns):
model2 = BM25_Model(Document_Features)

def getTopK(t):
-    k = 20
+    k = 30
    max_index = []
    for _ in range(k):
        number = max(t)
@@ -153,7 +165,7 @@ def getTopK(t):
    return max_index
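# Sketch, not in the original script: getTopK presumably selects the top-k documents by
# repeatedly taking max(t) and zeroing that entry, which mutates the score list (hence
# the pre_score_list copy made further down). A non-destructive equivalent:
import heapq

def get_top_k(scores, k=30):
    # indices of the k largest scores, leaving the input list untouched
    return heapq.nlargest(k, range(len(scores)), key=lambda idx: scores[idx])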


f = open("/Users/sheshuaijie/Desktop/workspace/Data/Data/SAMSum/val.json",'r')
f = open("/Users/sheshuaijie/Desktop/RearchSpace/Data/Data/SAMSum/val.json",'r')
testdata = json.load(f)
dialogue = testdata[1]['dialogue']
turns = dialogue.split('\n')
@@ -177,18 +189,63 @@ def getTopK(t):
#input_query = "Do you want some? Sure".split(" ")
#input_query = "Have you got any homework for tomorrow? no dad".split(" ")
#input_query = "What did you plan on doing?".split(" ")
#input_query = "are you in Warsaw? yes, just back!".split(" ")
#nput_query = 'do you have Betty\'s number? Lemme check'.split(" ")
input_query = " It's good for us, Vanessa and I are still on our way and Peter's stuck in a traffic".split(" ")
score_list = model2.get_documents_score(input_query)
input_query = "are you in Warsaw? yes, just back!"
#input_query = 'do you have Betty\'s number? Lemme check'.split(" ")
#input_query = " It's good for us, Vanessa and I are still on our way and Peter's stuck in a traffic".split(" ")
#input_query = "can you take your dog away before i come?".split(" ")
input_query = "Yeah. I definitely prefer Lisbon Yeah me too"

input_query = re_punctuation(input_query)
input_query_token = input_query.split(" ")
input_query_token = [j for j in input_query_token if len(j)!=0]
print(input_query)
score_list = model2.get_documents_score(input_query_token)
#score_list = model2.get_documents_score(input_query)
pre_score_list = score_list.copy()
best = getTopK(score_list)


stop_words = []
f = open('/Users/sheshuaijie/Downloads/stop_words_english.txt','r')
lines = f.readlines()
stop_words = [i.strip() for i in lines]

onegram_freq = {}
twogram_freq = {}
for i in best:
    #print(data[i]['summary'])
    print(" ".join(Document_Features[i]))
    print(pre_score_list[i])
    TwoGram = []
    # count bigram frequencies over the retrieved documents
    for _ in range(len(Document_Features[i]) - 1):
        twogram = Document_Features[i][_] + " " + Document_Features[i][_ + 1]
        twogram_freq[twogram] = twogram_freq.get(twogram, 0) + 1

    # count unigram frequencies over the retrieved documents
    for _ in range(len(Document_Features[i])):
        #twogram = Document_Features[i][_] + " " + Document_Features[i][_+1]
        onegram = Document_Features[i][_]
        onegram_freq[onegram] = onegram_freq.get(onegram, 0) + 1
    #print(pre_score_list[i])


one_gram_sort_result = sorted(onegram_freq.items(), key = lambda kv:(kv[1], kv[0]),reverse=True)[:30]
two_gram_sort_result = sorted(twogram_freq.items(), key = lambda kv:(kv[1], kv[0]),reverse=True)[:30]

freq_one_gram = [i[0] for i in one_gram_sort_result]
pattern = []
for i in input_query_token:
    if i in freq_one_gram:
        pattern.append(i)
    else:
        pattern.append('[UNK]')


print("============================================")
print(input_query)
print(" ".join(pattern))
# for i,j in zip(one_gram_sort_result,two_gram_sort_result):
# # if i[0] in stop_words:
# # continue
# print(i,j)
# for i in tqdm(range(len(lines))):
# data_dict = json.loads(lines[i])
# score_list = model2.get_documents_score(data_dict['feature'])
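The BM25_Model class used above is defined earlier in DialogueSearch.py and is not part of this hunk. For context, a minimal self-contained scorer implementing the standard BM25 formula, a sketch with assumed default parameters k1=1.5 and b=0.75, not the repository's class:

import math
from collections import Counter

class SimpleBM25:
    # documents are lists of tokens, as in Document_Features above
    def __init__(self, docs, k1=1.5, b=0.75):
        self.docs = docs
        self.k1, self.b = k1, b
        self.N = len(docs)
        self.avgdl = sum(len(d) for d in docs) / max(self.N, 1)
        self.doc_freq = Counter(tok for d in docs for tok in set(d))

    def idf(self, term):
        # standard BM25 idf with +1 smoothing
        n = self.doc_freq.get(term, 0)
        return math.log((self.N - n + 0.5) / (n + 0.5) + 1)

    def get_documents_score(self, query_tokens):
        scores = []
        for d in self.docs:
            tf = Counter(d)
            score = 0.0
            for q in query_tokens:
                f = tf.get(q, 0)
                if f == 0:
                    continue
                denom = f + self.k1 * (1 - self.b + self.b * len(d) / self.avgdl)
                score += self.idf(q) * f * (self.k1 + 1) / denom
            scores.append(score)
        return scores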
135 changes: 135 additions & 0 deletions QuerySearch/cluster.py
@@ -0,0 +1,135 @@
import pandas as pd
import codecs
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


if __name__ == "__main__":

    # corpus: each document as a space-joined token string
    corpus = []


    import re

    path = "/Users/sheshuaijie/Desktop/RearchSpace/Data/Data/SAMSum/test.json"
    import json
    f = open(path, 'r')
    data = json.load(f)
    f.close()

    import string
    punctuation_string = string.punctuation + '’'
    def re_punctuation(stri):
        # replace punctuation with spaces and lowercase the text
        for i in punctuation_string:
            stri = stri.replace(i, ' ').lower()
        return stri

    def return_biturn(turns):
        # pair each turn with the following turn (two-turn windows)
        biturns = []
        for i in range(0, len(turns) - 1):
            biturns.append(turns[i] + turns[i + 1])
        return biturns

    Document_Features = []
    for i in range(len(data)):
        dialogue = data[i]['dialogue']
        turns = dialogue.split('\n')
        # drop the "Speaker: " prefix, strip punctuation, and deduplicate turns
        turns = [i.strip()[len(i.split(":")[0]) + 2:] for i in turns]
        turns = [re_punctuation(i) for i in turns]
        turns = list(set(turns))
        turns = [i.split(" ") for i in turns]
        turns = [[j for j in i if len(j) != 0] for i in turns]
        turns = [" ".join(i) for i in turns]

        all_tokens = []
        #biturn = return_biturn(turns)
        #Document_Features.extend(biturn)

        corpus.extend(turns)


    # convert the texts into a term-frequency matrix; element a[i][j] is the count of word j in document i
    vectorizer = CountVectorizer(min_df=5)

    # computes the tf-idf weight of every word
    transformer = TfidfTransformer()

    # the inner fit_transform builds the count matrix, the outer one converts it to tf-idf
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    # all words in the bag-of-words vocabulary
    word = vectorizer.get_feature_names()

    # dense tf-idf matrix; w[i][j] is the tf-idf weight of word j in document i
    weight = tfidf.toarray()
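    # Sketch, not in the original script: the CountVectorizer + TfidfTransformer pair above
    # can be collapsed into a single TfidfVectorizer with the same min_df
    # (get_feature_names_out requires scikit-learn >= 1.0).
    from sklearn.feature_extraction.text import TfidfVectorizer
    tfidf_vec = TfidfVectorizer(min_df=5)
    weight_alt = tfidf_vec.fit_transform(corpus).toarray()  # same values as `weight` above
    word_alt = tfidf_vec.get_feature_names_out()            # same vocabulary as `word` above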


    # optionally dump the vocabulary and tf-idf weights to a file
    # resName = "Tfidf_Result.txt"
    # result = codecs.open(resName, 'w', 'utf-8')
    # for j in range(len(word)):
    #     result.write(word[j] + ' ')
    # result.write('\r\n\r\n')

    # # tf-idf weights per document: the outer loop walks the documents, the inner loop the words
    # for i in range(len(weight)):
    #     for j in range(len(word)):
    #         result.write(str(weight[i][j]) + ' ')
    #     result.write('\r\n\r\n')
    # result.close()

    print('Start Kmeans:')
    from sklearn.cluster import KMeans
    clf = KMeans(n_clusters=100)  # e.g. technology / medicine / cars / countries
    s = clf.fit(weight)
    print('Kmeans finished')

    # cluster assignment of each sample
    label = list(clf.labels_)

    y_pred = clf.labels_

    # group the turns by cluster id
    label2text = {}
    for i, j in zip(y_pred, corpus):
        if i in label2text:
            label2text[i].append(j)
        else:
            label2text[i] = [j]

    # write one file per cluster
    import os
    os.makedirs("cluster", exist_ok=True)
    for i in label2text:
        f = open("cluster/" + str(i) + ".txt", 'w')
        for j in label2text[i]:
            f.write(j + "\n")
        f.close()

    # Optional 2-D visualisation of the clusters (commented out):
    # from sklearn.decomposition import PCA
    # pca = PCA(n_components=2)            # project down to two dimensions
    # newData = pca.fit_transform(weight)  # fit on the N-dimensional tf-idf weights

    # xs, ys = newData[:, 0], newData[:, 1]
    # # colours for the clusters
    # cluster_colors = {0: 'r', 1: 'yellow', 2: 'b', 3: 'chartreuse', 4: 'purple', 5: '#FFC0CB', 6: '#6A5ACD', 7: '#98FB98'}

    # # names for the clusters
    # cluster_names = {0: u'cluster 0', 1: u'cluster 1', 2: u'cluster 2', 3: u'cluster 3', 4: u'cluster 4', 5: u'cluster 5', 6: u'cluster 6', 7: u'cluster 7'}

    # df = pd.DataFrame(dict(x=xs, y=ys, label=y_pred, title=corpus))
    # groups = df.groupby('label')

    # fig, ax = plt.subplots(figsize=(8, 5))  # set size
    # ax.margins(0.02)
    # for name, group in groups:
    #     ax.plot(group.x, group.y, marker='o', linestyle='', ms=10, label=cluster_names[name], color=cluster_colors[name], mec='none')

    # plt.show()
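A common follow-up for inspecting such clusters is to print the highest-weighted vocabulary terms of each centroid. A minimal sketch using the variables defined above (hypothetical, not part of this commit):

import numpy as np

# rank vocabulary terms by their weight in each KMeans centroid
terms = np.asarray(word)
order = np.argsort(clf.cluster_centers_, axis=1)[:, ::-1]
for c in range(clf.n_clusters):
    print(c, " ".join(terms[order[c, :10]]))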
Binary file added QuerySearch/data.dict
Binary file added QuerySearch/data.trained