Commit d111e5f

update

Ricardokevins committed Jan 3, 2023
1 parent 503ed33
Showing 7 changed files with 996 additions and 0 deletions.
206 changes: 206 additions & 0 deletions QuerySearch/DialogueSearch.py
@@ -0,0 +1,206 @@
# TF-IDF and BM25 retrieval over SAMSum dialogue turns.
# Follows the implementation at https://blog.csdn.net/chaojianmo/article/details/105143657
from tqdm import tqdm
import numpy as np
class TF_IDF_Model(object):
    def __init__(self, documents_list):
        self.documents_list = documents_list
        self.documents_number = len(documents_list)
        self.tf = []   # per-document term frequencies (normalized by document length)
        self.idf = {}  # corpus-level inverse document frequencies
        self.init()

    def init(self):
        df = {}
        for document in tqdm(self.documents_list):
            temp = {}
            for word in document:
                temp[word] = temp.get(word, 0) + 1 / len(document)
            self.tf.append(temp)
            for key in temp.keys():
                df[key] = df.get(key, 0) + 1
        for key, value in df.items():
            self.idf[key] = np.log(self.documents_number / (value + 1))

    def get_score(self, index, query):
        score = 0.0
        for q in query:
            if q not in self.tf[index]:
                continue
            score += self.tf[index][q] * self.idf[q]
        return score

    def get_documents_score(self, query):
        score_list = []
        best_score = -1
        result_list = []
        for i in tqdm(range(self.documents_number)):
            cur_score = self.get_score(i, query)
            score_list.append(cur_score)
            if best_score < cur_score:
                best_score = cur_score
                result_list.append(self.documents_list[i])
        # Print the last few running-best documents as a quick sanity check.
        for doc in result_list[-5:]:
            print(" ".join(doc))
        return score_list
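# A minimal usage sketch (toy data, not from the repo): each score sums
# tf(q, doc) * log(N / (df(q) + 1)) over the query terms q.
#   docs = [["hello", "world"], ["see", "you", "tomorrow"]]
#   tfidf = TF_IDF_Model(docs)
#   scores = tfidf.get_documents_score(["hello", "tomorrow"])  # one score per doc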


from collections import Counter


class BM25_Model(object):
    def __init__(self, documents_list, k1=2, k2=1, b=0.5):
        self.documents_list = documents_list
        self.documents_number = len(documents_list)
        self.avg_documents_len = sum(len(document) for document in documents_list) / self.documents_number
        self.f = []    # per-document term frequencies (raw counts)
        self.idf = {}
        self.k1 = k1
        self.k2 = k2
        self.b = b
        self.init()

    def init(self):
        df = {}
        for document in tqdm(self.documents_list):
            temp = {}
            for word in document:
                temp[word] = temp.get(word, 0) + 1
            self.f.append(temp)
            for key in temp.keys():
                df[key] = df.get(key, 0) + 1
        for key, value in df.items():
            self.idf[key] = np.log((self.documents_number - value + 0.5) / (value + 0.5))

    def get_score(self, index, query):
        score = 0.0
        # BM25 normalizes by the document's token count; len(self.f[index])
        # would count unique terms only and disagree with avg_documents_len.
        document_len = len(self.documents_list[index])
        qf = Counter(query)
        for q in query:
            if q not in self.f[index]:
                continue
            score += self.idf[q] * (self.f[index][q] * (self.k1 + 1) / (
                self.f[index][q] + self.k1 * (1 - self.b + self.b * document_len / self.avg_documents_len))) * (
                qf[q] * (self.k2 + 1) / (qf[q] + self.k2))
        return score

    def get_documents_score(self, query):
        score_list = []
        for i in range(self.documents_number):
            score_list.append(self.get_score(i, query))
        return score_list
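# A minimal usage sketch (toy data, not from the repo). Per matching term the
# score is idf(q) * tf*(k1+1) / (tf + k1*(1 - b + b*|d|/avgdl)) * qf*(k2+1) / (qf + k2).
#   bm25 = BM25_Model([["hello", "world"], ["see", "you", "tomorrow"]])
#   scores = bm25.get_documents_score(["hello", "tomorrow"])
# Note: this idf is negative for terms occurring in more than half of the
# documents, so whole-document scores can drop below zero.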

import json

path = "/Users/sheshuaijie/Desktop/workspace/Data/Data/SAMSum/train.json"
with open(path, 'r') as f:
    data = json.load(f)

def return_biturn(turns):
    # Concatenate adjacent turns pairwise (0+1, 2+3, ...) into two-turn documents.
    biturns = []
    for i in range(0, len(turns) - 1, 2):
        biturns.append(turns[i] + turns[i + 1])
    return biturns

Document_Features = []
for sample in data:
    dialogue = sample['dialogue']
    turns = dialogue.split('\n')
    # Drop the "Speaker: " prefix from every turn, then tokenize on whitespace.
    turns = [t.strip()[len(t.split(":")[0]) + 2:] for t in turns]
    turns = [t.split(" ") for t in turns]
    Document_Features.extend(return_biturn(turns))
    # Alternative granularities: one document per single turn, or per dialogue.

model = TF_IDF_Model(Document_Features)
model2 = BM25_Model(Document_Features)

def getTopK(t):
    # Return the indices of the k highest scores. NOTE: destructive -- it
    # zeroes out the selected entries of t in place (callers pass a copy),
    # and the zeroing assumes the scores of interest are positive.
    k = 20
    max_index = []
    for _ in range(k):
        number = max(t)
        index = t.index(number)
        t[index] = 0
        max_index.append(index)
    return max_index
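# A non-destructive alternative (a sketch; getTopK_np is not in the repo):
#   def getTopK_np(scores, k=20):
#       scores = np.asarray(scores)
#       top = np.argpartition(scores, -k)[-k:]              # top-k, unordered, O(n)
#       return top[np.argsort(scores[top])[::-1]].tolist()  # sorted descending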


with open("/Users/sheshuaijie/Desktop/workspace/Data/Data/SAMSum/val.json", 'r') as f:
    testdata = json.load(f)
dialogue = testdata[1]['dialogue']
turns = dialogue.split('\n')
turns = [t.strip()[len(t.split(":")[0]) + 2:] for t in turns]
turns = [t.split(" ") for t in turns]

# One could also issue every turn of the dialogue as a separate query:
# for turn in turns:
#     score_list = model2.get_documents_score(turn)
#     ...

# Other queries tried during development:
# input_query = "Do you want some? Sure".split(" ")
# input_query = "Have you got any homework for tomorrow? no dad".split(" ")
# input_query = "What did you plan on doing?".split(" ")
# input_query = "are you in Warsaw? yes, just back!".split(" ")
# input_query = 'do you have Betty\'s number? Lemme check'.split(" ")
input_query = " It's good for us, Vanessa and I are still on our way and Peter's stuck in a traffic".split(" ")
score_list = model2.get_documents_score(input_query)
pre_score_list = score_list.copy()  # keep the scores before getTopK zeroes them
best = getTopK(score_list)
for i in best:
    # print(data[i]['summary'])
    print(" ".join(Document_Features[i]))
    print(pre_score_list[i])


# Batch variant used to precompute neighbours for every training example
# (lines/fout refer to a file-based setup not defined in this script):
# for i in tqdm(range(len(lines))):
#     data_dict = json.loads(lines[i])
#     score_list = model2.get_documents_score(data_dict['feature'])
#     score_list[i] = -1  # exclude the example itself
#     best = getTopK(score_list)
#     best = [str(j) for j in best]
#     fout.write(" ".join(best) + '\n')

# input_query = lines[-1]
# model1 = TF_IDF_Model(lines)
# model1.get_documents_score(input_query)

# model2 = BM25_Model(lines)
# model2.get_documents_score(input_query)
68 changes: 68 additions & 0 deletions utils/build_vec.py
@@ -0,0 +1,68 @@
import json

import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the pre-trained summarization model (weights) on CPU.
device = torch.device("cpu")
model = BartForConditionalGeneration.from_pretrained(
    '/Users/sheshuaijie/Desktop/RearchSpace/Data/PLM/linydub-bart-large-samsum',
    output_hidden_states=True,
    output_attentions=True,
)
model.eval()
# Load the matching pre-trained tokenizer (vocabulary).
tokenizer = BartTokenizer.from_pretrained('/Users/sheshuaijie/Desktop/RearchSpace/Data/PLM/linydub-bart-large-samsum')

Feature = []
Score_distribution = []
tokens = []
with torch.no_grad():
    path = "/Users/sheshuaijie/Desktop/workspace/Data/Data/SAMSum/train.json"
    with open(path, 'r') as f:
        data = json.load(f)
    for sample in tqdm(data):
        source = sample['dialogue']
        target = sample['summary']
        src_encoded = tokenizer(
            [source],
            max_length=1024,
            truncation=True,
            padding=True,
            return_tensors='pt',
        )
        src_tokens = src_encoded['input_ids'].to(device)
        src_attn_mask = src_encoded['attention_mask'].to(device)

        tgt_encoded = tokenizer(
            [target],
            max_length=1024,
            truncation=True,
            padding=True,
            return_tensors='pt',
        )
        tgt_tokens = tgt_encoded['input_ids'].to(device)
        tgt_attn_mask = tgt_encoded['attention_mask'].to(device)

        # Teacher-forced forward pass: the summary supplies the decoder inputs.
        return_state = model(
            input_ids=src_tokens,
            attention_mask=src_attn_mask,
            labels=tgt_tokens,
        )
        logits = return_state['logits'].view(-1, model.config.vocab_size)
        decoder_state = return_state.decoder_hidden_states
        # Keep the last decoder layer's hidden state for every target token.
        Feature.append(decoder_state[-1].reshape(-1, 1024))
        Score_distribution.append(logits)
        tokens.append(tgt_tokens.reshape(-1))


# Stack everything into flat (num_target_tokens, ...) tensors.
feature = torch.cat(Feature, 0)
score = torch.cat(Score_distribution, 0)
tokens = torch.cat(tokens, 0)
print(feature.shape)
print(score.shape)
print(tokens.shape)
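# The stacked decoder states could feed a nearest-neighbour index, e.g. with
# faiss (a sketch, assuming faiss is installed; cf. utils/faiss_demo.py):
#   import faiss
#   index = faiss.IndexFlatL2(1024)             # 1024 = BART-large hidden size
#   index.add(feature.numpy())                  # one row per target token
#   D, I = index.search(feature[:5].numpy(), 4)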



27 changes: 27 additions & 0 deletions utils/faiss_demo.py
@@ -0,0 +1,27 @@
import numpy as np

import faiss

d = 64         # vector dimensionality
nb = 100000    # database size
nq = 10000     # number of queries
np.random.seed(1234)  # fixed seed so the results are reproducible
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.

nlist = 100    # number of IVF cells (coarse clusters)
m = 8          # number of product-quantizer sub-vectors
k = 4          # neighbours to return per query
quantizer = faiss.IndexFlatL2(d)  # the coarse quantizer is still a flat L2 index
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
# With m=8 sub-quantizers at 8 bits each, every vector is encoded into 8 bytes.
index.train(xb)
index.add(xb)
D, I = index.search(xb[:5], k)  # sanity check: query the first 5 database vectors
print(I)
print(D)
# index.nprobe = 10           # probe more cells than the default nprobe=1 for better recall
# D, I = index.search(xq, k)  # search the held-out query set
# print(I[-5:])
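# Size check (a quick calculation, not in the original file): raw float32
# vectors take d * 4 = 64 * 4 = 256 bytes each, so the 8-byte PQ codes give
# roughly a 32x compression, at the cost of approximate distances.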