diff --git a/QuerySearch/DialogueSearch.py b/QuerySearch/DialogueSearch.py new file mode 100644 index 00000000..1bb4d00e --- /dev/null +++ b/QuerySearch/DialogueSearch.py @@ -0,0 +1,206 @@ +#path = 'train.target' + +# Follow implement in https://blog.csdn.net/chaojianmo/article/details/105143657 + +# path = 'train.source' +from tqdm import tqdm +import numpy as np +class TF_IDF_Model(object): + def __init__(self, documents_list): + self.documents_list = documents_list + self.documents_number = len(documents_list) + self.tf = [] + self.idf = {} + self.init() + + def init(self): + df = {} + for document in tqdm(self.documents_list): + temp = {} + for word in document: + temp[word] = temp.get(word, 0) + 1/len(document) + self.tf.append(temp) + for key in temp.keys(): + df[key] = df.get(key, 0) + 1 + for key, value in df.items(): + self.idf[key] = np.log(self.documents_number / (value + 1)) + + def get_score(self, index, query): + score = 0.0 + for q in query: + if q not in self.tf[index]: + continue + score += self.tf[index][q] * self.idf[q] + return score + + def get_documents_score(self, query): + score_list = [] + best_score = -1 + best_result = 0 + result_list = [] + for i in tqdm(range(self.documents_number)): + cur_score = self.get_score(i, query) + score_list.append(cur_score) + if best_score < cur_score: + best_score = cur_score + # best_result = i + #print(best_score) + #print(self.documents_list[i]) + result_list.append(self.documents_list[i]) + for i in result_list[-5:]: + print(" ".join(i)) + #print(result_list[:3]) + return score_list + + +import numpy as np +from collections import Counter + + +class BM25_Model(object): + def __init__(self, documents_list, k1=2, k2=1, b=0.5): + self.documents_list = documents_list + self.documents_number = len(documents_list) + self.avg_documents_len = sum([len(document) for document in documents_list]) / self.documents_number + self.f = [] + self.idf = {} + self.k1 = k1 + self.k2 = k2 + self.b = b + self.init() + + def init(self): + df = {} + for document in tqdm(self.documents_list): + temp = {} + for word in document: + temp[word] = temp.get(word, 0) + 1 + self.f.append(temp) + for key in temp.keys(): + df[key] = df.get(key, 0) + 1 + for key, value in df.items(): + self.idf[key] = np.log((self.documents_number - value + 0.5) / (value + 0.5)) + + def get_score(self, index, query): + score = 0.0 + document_len = len(self.f[index]) + qf = Counter(query) + for q in query: + if q not in self.f[index]: + continue + score += self.idf[q] * (self.f[index][q] * (self.k1 + 1) / ( + self.f[index][q] + self.k1 * (1 - self.b + self.b * document_len / self.avg_documents_len))) * ( + qf[q] * (self.k2 + 1) / (qf[q] + self.k2)) + + return score + + def get_documents_score(self, query): + score_list = [] + best_score = -1 + best_result = 0 + result_list = [] + for i in range(self.documents_number): + cur_score = self.get_score(i, query) + score_list.append(cur_score) + # if best_score < cur_score: + # best_score = cur_score + # # best_result = i + # #print(best_score) + # #print(self.documents_list[i]) + # result_list.append(self.documents_list[i]) + # for i in result_list[-3:]: + # print(" ".join(i)) + return score_list + +path = "/Users/sheshuaijie/Desktop/workspace/Data/Data/SAMSum/train.json" +import json +f = open(path,'r') +data = json.load(f) +f.close() + +def return_biturn(turns): + biturns = [] + for i in range(0,len(turns)-1,2): + biturns.append(turns[i]+turns[i+1]) + return biturns + +Document_Features = [] +for i in range(len(data)): + dialogue = 
data[i]['dialogue'] + turns = dialogue.split('\n') + turns = [i.strip()[len(i.split(":")[0])+2:] for i in turns] + turns = [i.split(" ") for i in turns] + all_tokens = [] + biturn = return_biturn(turns) + Document_Features.extend(biturn) + # for t in turns: + # all_tokens.extend(t) + # Document_Features.append(t) + # print(all_tokens) + # exit() + #Document_Features.append(all_tokens) +model = TF_IDF_Model(Document_Features) +model2 = BM25_Model(Document_Features) + +def getTopK(t): + k = 20 + max_index = [] + for _ in range(k): + number = max(t) + index = t.index(number) + t[index] = 0 + max_index.append(index) + return max_index + + +f = open("/Users/sheshuaijie/Desktop/workspace/Data/Data/SAMSum/val.json",'r') +testdata = json.load(f) +dialogue = testdata[1]['dialogue'] +turns = dialogue.split('\n') +turns = [i.strip()[len(i.split(":")[0])+2:] for i in turns] +turns = [i.split(" ") for i in turns] +input_query = [] +# for i in range(len(turns)): +# input_query = turns[i] +# # for t in turns: +# # input_query.extend(t) + +# print(" ".join(input_query)) +# print("=====================================================") +# score_list = model2.get_documents_score(input_query) +# pre_score_list = score_list.copy() +# best = getTopK(score_list) +# for i in best: +# #print(data[i]['summary']) +# print(" ".join(Document_Features[i])) +# print(pre_score_list[i]) +#input_query = "Do you want some? Sure".split(" ") +#input_query = "Have you got any homework for tomorrow? no dad".split(" ") +#input_query = "What did you plan on doing?".split(" ") +#input_query = "are you in Warsaw? yes, just back!".split(" ") +#nput_query = 'do you have Betty\'s number? Lemme check'.split(" ") +input_query = " It's good for us, Vanessa and I are still on our way and Peter's stuck in a traffic".split(" ") +score_list = model2.get_documents_score(input_query) +pre_score_list = score_list.copy() +best = getTopK(score_list) +for i in best: + #print(data[i]['summary']) + print(" ".join(Document_Features[i])) + print(pre_score_list[i]) + + +# for i in tqdm(range(len(lines))): +# data_dict = json.loads(lines[i]) +# score_list = model2.get_documents_score(data_dict['feature']) + +# score_list[i] = -1 +# best = getTopK(score_list) +# best = [str(i) for i in best] +# fout.write(" ".join(best) + '\n') + +# input_query = lines[-1] +# model1 = TF_IDF_Model(lines) +# model1.get_documents_score(input_query) + +# model2 = BM25_Model(lines) +# model2.get_documents_score(input_query) \ No newline at end of file diff --git a/utils/build_vec.py b/utils/build_vec.py new file mode 100644 index 00000000..ab0d5fc1 --- /dev/null +++ b/utils/build_vec.py @@ -0,0 +1,68 @@ +import numpy as np +import torch +import torch.nn as nn +from transformers import BartTokenizer, BartForConditionalGeneration +# Load pre-trained model (weights) +device = torch.device("cpu") +model = BartForConditionalGeneration.from_pretrained('/Users/sheshuaijie/Desktop/RearchSpace/Data/PLM/linydub-bart-large-samsum',output_hidden_states = True,output_attentions=True) +model.eval() +# Load pre-trained model tokenizer (vocabulary) +tokenizer = BartTokenizer.from_pretrained('/Users/sheshuaijie/Desktop/RearchSpace/Data/PLM/linydub-bart-large-samsum') + +Feature = [] +Score_distribution = [] +tokens = [] +with torch.no_grad(): + path = "/Users/sheshuaijie/Desktop/workspace/Data/Data/SAMSum/train.json" + import json + f = open(path,'r') + data = json.load(f) + f.close() + from tqdm import tqdm + for i in tqdm(data): + source = i['dialogue'] + target = i['summary'] + 
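+        # Tokenize the dialogue (source) and reference summary (target), then run one
+        # teacher-forced forward pass to cache last-layer decoder states, output logits,
+        # and the gold summary token ids for every decoding position.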
src_encoded = tokenizer(
+            [source],
+            max_length=1024,
+            truncation=True,
+            padding=True,
+            return_tensors='pt'
+        )
+        src_tokens = src_encoded['input_ids'].to(device)
+        src_attn_mask = src_encoded['attention_mask'].to(device)
+
+        tgt_encoded = tokenizer(
+            [target],
+            max_length=1024,
+            truncation=True,
+            padding=True,
+            return_tensors='pt'
+        )
+        tgt_tokens = tgt_encoded['input_ids'].to(device)
+        tgt_attn_mask = tgt_encoded['attention_mask'].to(device)
+
+        return_state = model(
+            input_ids=src_tokens,
+            attention_mask=src_attn_mask,
+            labels=tgt_tokens
+        )
+        logits = return_state['logits'].view(-1, model.model.config.vocab_size)
+        decoder_state = return_state.decoder_hidden_states
+        Feature.append(decoder_state[-1].reshape(-1, 1024))
+        Score_distribution.append(logits)
+        tokens.append(tgt_tokens.reshape(-1))
+        # print(len(decoder_state))
+        # print(logits.shape)
+        # print(decoder_state[-1].shape)
+
+
+feature = torch.cat(Feature, 0)
+score = torch.cat(Score_distribution, 0)
+tokens = torch.cat(tokens, 0)
+print(feature.shape)
+print(score.shape)
+print(tokens.shape)
+# NOTE: knnsum.py later loads feature.pt, score.pt and tokens.pt; saving these tensors here
+# (e.g. torch.save(feature, 'feature.pt')) is presumably the intended final step.
diff --git a/utils/faiss_demo.py b/utils/faiss_demo.py
new file mode 100644
index 00000000..5ded02ce
--- /dev/null
+++ b/utils/faiss_demo.py
@@ -0,0 +1,27 @@
+import numpy as np
+
+d = 64                              # vector dimensionality
+nb = 100000                         # database size
+nq = 10000                          # number of queries
+np.random.seed(1234)                # fixed seed so results are reproducible
+xb = np.random.random((nb, d)).astype('float32')
+xb[:, 0] += np.arange(nb) / 1000.
+xq = np.random.random((nq, d)).astype('float32')
+xq[:, 0] += np.arange(nq) / 1000.
+
+import faiss
+
+nlist = 100
+m = 8
+k = 4
+quantizer = faiss.IndexFlatL2(d)  # the coarse quantizer is still a flat L2 index
+index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
+                                  # each vector is encoded into 8 bytes
+index.train(xb)
+index.add(xb)
+D, I = index.search(xb[:5], k)  # sanity check: query with the first five database vectors
+print(I)
+print(D)
+# index.nprobe = 10               # probe more cells than the default
+# D, I = index.search(xq, k)      # actual search over the query set
+# print(I[-5:])
\ No newline at end of file
diff --git a/utils/generation.py b/utils/generation.py
new file mode 100644
index 00000000..d3049431
--- /dev/null
+++ b/utils/generation.py
@@ -0,0 +1,180 @@
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    LogitsProcessorList,
+    MinLengthLogitsProcessor,
+    StoppingCriteriaList,
+    MaxLengthCriteria,
+)
+
+tokenizer = AutoTokenizer.from_pretrained("/Users/sheshuaijie/Desktop/RearchSpace/Data/PLM/linydub-bart-large-samsum")
+model = AutoModelForSeq2SeqLM.from_pretrained("/Users/sheshuaijie/Desktop/RearchSpace/Data/PLM/linydub-bart-large-samsum")
+input_prompt = "Aude: Hi Susie, how is Ted this morning? Did you find plasters?\nSusie: yes. He kept them till this morning after his shower.\nAude: he must look sexy whith them... lol\nSusie: a memory from Poland!"
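+
+# Baseline check: summarize the prompt with generate() (greedy, num_beams=1), print the
+# result, and exit() before the greedy-search reference code below, which appears to be
+# copied from the transformers generation utilities.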
+inputs = tokenizer([input_prompt], max_length=1024, return_tensors="pt") +summary_ids = model.generate(inputs["input_ids"], num_beams=1, min_length=0, max_length=30) +print(tokenizer.batch_decode(summary_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]) +exit() +input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + +# instantiate logits processors +logits_processor = LogitsProcessorList( + [ + MinLengthLogitsProcessor(10, eos_token_id=tokenizer.eos_token_id), + ] +) +stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) + +outputs = model.greedy_search( + input_ids = input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria +) + +print(tokenizer.batch_decode(outputs, skip_special_tokens=False)) + + +def _update_model_kwargs_for_generation( + outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False +) -> Dict[str, Any]: + # update past + if "past_key_values" in outputs: + model_kwargs["past"] = outputs.past_key_values + elif "mems" in outputs: + model_kwargs["past"] = outputs.mems + elif "past_buckets_states" in outputs: + model_kwargs["past"] = outputs.past_buckets_states + else: + model_kwargs["past"] = None + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1) + + # update attention mask + if not is_encoder_decoder: + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + return model_kwargs + +def greedy_search( + self, + input_ids: torch.LongTensor, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = False, + **model_kwargs, + ) -> Union[GreedySearchOutput, torch.LongTensor]: + + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + warnings.warn( + "`max_length` is deprecated in this function, use" + " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores 
tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + cur_len = input_ids.shape[-1] + + this_peer_finished = False # used by synced_gpus only + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + model_inputs = prepare_inputs_for_generation(input_ids, **model_kwargs) + # for name in model_inputs: + # print(name) + # print(model_inputs) + # exit() + + print(model_inputs['decoder_input_ids']) + #print(model_inputs) + #exit() + # forward pass to get next token + + outputs = model( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + #BOOKMARK + + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits) + + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + if pad_token_id is None: + raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.") + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + # model_kwargs = _update_model_kwargs_for_generation( + # outputs, model_kwargs, is_encoder_decoder=True + # ) + cur_len = cur_len + 1 + + # if eos_token was found in one sentence, set sentence to finished + # if eos_token_id is not None: + # unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long()) + + # # stop when each sentence is finished, or if we exceed the maximum length + # if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + # if not synced_gpus: + # break + # else: + # this_peer_finished = True + + return input_ids \ No newline at end of file diff --git a/utils/knnsum.py b/utils/knnsum.py new file mode 100644 index 00000000..05475178 --- /dev/null +++ b/utils/knnsum.py @@ -0,0 +1,183 @@ +import numpy as np +import torch +import torch.nn as nn +import warnings + +warnings.filterwarnings('ignore') +V_set = torch.load('feature.pt') +score_set = torch.load('score.pt') +token_set = 
torch.load('tokens.pt')
+
+# d = 64                            # vector dimensionality
+# nb = 100000                       # database size
+# nq = 10000                        # number of queries
+# np.random.seed(1234)              # fixed seed so results are reproducible
+# xb = np.random.random((nb, d)).astype('float32')
+# xb[:, 0] += np.arange(nb) / 1000.
+# xq = np.random.random((nq, d)).astype('float32')
+# xq[:, 0] += np.arange(nq) / 1000.
+# /home/data_ti4_d/wangjl/miniconda3/pkgs/libstdcxx-ng-11.2.0-h1234567_1/lib/libstdc++.so.6.0.29
+import faiss
+
+# Build an IVF-PQ index over the cached decoder hidden states produced by build_vec.py.
+d = 1024
+nlist = 100
+m = 64
+k = 10
+lambda_weight = 0.6
+quantizer = faiss.IndexFlatL2(d)  # the coarse quantizer is still a flat L2 index
+index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
+index.train(V_set)
+index.add(V_set)
+
+
+from transformers import BartTokenizer, BartForConditionalGeneration
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    LogitsProcessorList,
+    MinLengthLogitsProcessor,
+    StoppingCriteriaList,
+    MaxLengthCriteria,
+    NoRepeatNGramLogitsProcessor,
+    ForcedBOSTokenLogitsProcessor,
+)
+
+# Load pre-trained model (weights)
+device = torch.device("cpu")
+pretrain_path = '/home/shesj/workspace/Data/PLM/linydub-bart-large-samsum'
+model = BartForConditionalGeneration.from_pretrained(pretrain_path, output_hidden_states=True, output_attentions=True)
+model.eval()
+tokenizer = BartTokenizer.from_pretrained(pretrain_path)
+
+#device = "cuda" if torch.cuda.is_available() else "cpu"
+device = 'cpu'
+
+logits_processor = LogitsProcessorList([NoRepeatNGramLogitsProcessor(3), ForcedBOSTokenLogitsProcessor(2)])
+
+
+def greedy_search(input_prompt):
+    eos_token_id = model.config.eos_token_id
+    decode_length = 100
+    decoded_ids = torch.tensor([[2]]).to(device)
+    for t in range(decode_length):
+        encoded_src = tokenizer(
+            [input_prompt],
+            max_length=1024,
+            truncation=True,
+            padding=True,
+            return_tensors='pt'
+        )
+        #print(source)
+
+        src_tokens = encoded_src['input_ids'].to(device)
+        src_mask = encoded_src['attention_mask'].to(device)
+
+        output = model(
+            input_ids=src_tokens,
+            attention_mask=src_mask,
+            decoder_input_ids=decoded_ids
+        )
+        #logits = output.logits.view(-1, model.config.vocab_size)
+        next_token_logits = output.logits[:, -1, :]
+
+        next_tokens_scores = logits_processor(decoded_ids, next_token_logits)
+        next_tokens = torch.argmax(next_tokens_scores, dim=-1)
+        decoded_ids = torch.cat([decoded_ids, next_tokens[:, None]], dim=-1)
+        #print(next_tokens,eos_token_id)
+        if next_tokens == eos_token_id and t>3:
+            break
+        #print(next_tokens)
+
+    return decoded_ids
+
+def greedy_search_withKNN(input_prompt):
+    eos_token_id = model.config.eos_token_id
+    decode_length = 100
+    decoded_ids = torch.tensor([[2]]).to(device)
+    for t in range(decode_length):
+        encoded_src = tokenizer(
+            [input_prompt],
+            max_length=1024,
+            truncation=True,
+            padding=True,
+            return_tensors='pt'
+        )
+        #print(source)
+
+        src_tokens = encoded_src['input_ids'].to(device)
+        src_mask = encoded_src['attention_mask'].to(device)
+
+        output = model(
+            input_ids=src_tokens,
+            attention_mask=src_mask,
+            decoder_input_ids=decoded_ids
+        )
+        #logits = output.logits.view(-1, model.config.vocab_size)
+        decoder_state = output.decoder_hidden_states[-1]
+        #print(decoder_state.shape)
+        # Retrieve the k nearest cached decoder states; a softmax over negative L2 distances
+        # gives the interpolation weights for the retrieved token distributions.
+        D, I = index.search(decoder_state.reshape(-1, 1024), k)
+        sft = nn.Softmax(dim=-1)  # explicit dim (was nn.Softmax()) to silence the implicit-dimension warning
+        D = sft(-torch.tensor(D)).reshape(-1)
+        I = torch.tensor(I).reshape(-1)
+        # NOTE: D and I are flattened over all decoder positions, so the loop below mixes in
+        # neighbours of the first position; searching only the last state
+        # (decoder_state[:, -1:, :]) may be what is intended here.
+        next_token_logits = output.logits[:, -1, :] * 
lambda_weight + for sam_index in range(k): + + next_token_logits += score_set[I[sam_index]] * D[sam_index] * (1-lambda_weight) + + next_tokens_scores = logits_processor(decoded_ids, next_token_logits) + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + decoded_ids = torch.cat([decoded_ids, next_tokens[:, None]], dim=-1) + #print(next_tokens,eos_token_id) + if next_tokens == eos_token_id and t>3: + break + #print(next_tokens) + + return decoded_ids + +with torch.no_grad(): + path = "/home/shesj/workspace/Data/Data/SAMSum/val.json" + import json + f = open(path,'r') + data = json.load(f) + data = data[10:] + f.close() + from tqdm import tqdm + for i in tqdm(data): + source = i['dialogue'] + ids = greedy_search_withKNN(source) + hypo = tokenizer.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + print(hypo) + ids = greedy_search(source) + hypo = tokenizer.batch_decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + print(hypo) + print(i['summary']) + print("====================================================") + # exit() + # logits = return_state['logits'].view(-1, model.model.config.vocab_size) + # decoder_state = return_state.decoder_hidden_states + # decoder_state = decoder_state[-1].reshape(-1,1024) + # print(V_set.shape) + # print(decoder_state.shape) + # D,I = index.search(decoder_state,4) + # for i in I: + # for j in i: + # print(tokenizer.decode(token_set[j])) + # print('============================') + # #print(i) + # exit() +# print(I) +# print(D) +# index.nprobe = 10 # 与以前的方法相比 +# D, I = index.search(xq, k) # 检索 +# print(I[-5:]) \ No newline at end of file diff --git a/utils/segment_dialogue.py b/utils/segment_dialogue.py new file mode 100644 index 00000000..71c1bf6a --- /dev/null +++ b/utils/segment_dialogue.py @@ -0,0 +1,230 @@ + + +# -*- coding: utf-8 -*- +# A python implementation of C99 algorithm for topic segmentation +from collections import Counter +import numpy as np +# -*- coding: utf-8 -*- +import numpy as np +from collections import Counter +def cosine_sim(c1, c2): + try: + # works for Counter + n1 = np.sqrt(sum([x * x for x in list(c1.values())])) + n2 = np.sqrt(sum([x * x for x in list(c2.values())])) + num = sum([c1[key] * c2[key] for key in c1]) + except: + # works for ordinary list + assert(len(c1) == len(c2)) + n1 = np.sqrt(sum([x * x for x in c1])) + n2 = np.sqrt(sum([x * x for x in c2])) + num = sum([c1[i] * c2[i] for i in range(len(c1))]) + try: + if n1 * n2 < 1e-9: # divide by zero case + return 0 + return num / (n1 * n2) + except: + return 0 + +class EnglishTokenizer: + """ + A tokenizer is a class with tokenize(text) method + """ + def __init__(self): + pass + + def tokenize(self, text): + return text.lower().split() + + +class C99: + """ + Reference: + "Advances in domain independent linear text segmentation" + """ + def __init__(self, window=4, std_coeff=1.2, tokenizer=EnglishTokenizer()): + """ + window: int, window size for local similarity ranking + std_coeff: double, threshold to determine boundary, see paper for more details + tokenizer: an object with tokenize() method, + which takes a string as argument and return a sequence of tokens. 
+ """ + self.window = window + self.sim = None + self.rank = None + self.sm = None + self.std_coeff = std_coeff + self.tokenizer = tokenizer + + def segment(self, document): + """ + document: list[str] + return list[int], + i-th element denotes whether exists a boundary right before paragraph i(0 indexed) + """ + #assert(len(document) > 0 and len([d for d in document if not isinstance(d, str)]) == 0) + if len(document) < 3: + return [1] + [0 for _ in range(len(document) - 1)] + # step 1, preprocessing + n = len(document) + self.window = min(self.window, n) + + + cnts = [Counter(self.tokenizer.tokenize(document[i])) for i in range(n)] + #cnts = document + + + + # step 2, compute similarity matrix + self.sim = np.zeros((n, n)) + for i in range(n): + for j in range(i, n): + self.sim[i][j] = cosine_sim(cnts[i], cnts[j]) + self.sim[j][i] = self.sim[i][j] + + # step 3, compute rank matrix & sum matrix + self.rank = np.zeros((n, n)) + for i in range(n): + for j in range(i, n): + r1 = max(0, i - self.window + 1) + r2 = min(n - 1, i + self.window - 1) + c1 = max(0, j - self.window + 1) + c2 = min(n - 1, j + self.window - 1) + sublist = self.sim[r1:(r2 + 1), c1:(c2+1)].flatten() + lowlist = [x for x in sublist if x < self.sim[i][j]] + self.rank[i][j] = 1.0 * len(lowlist) / ((r2 - r1 + 1) * (c2 - c1 + 1)) + self.rank[j][i] = self.rank[i][j] + + self.sm = np.zeros((n, n)) + # O(n^4) solution + # for i in xrange(n): + # for j in xrange(i, n): + # self.sm[i][j] = sum(self.rank[i:(j + 1), i:(j + 1)].flatten()) + # self.sm[j][i] = self.sm[i][j] + # O(n^2) solution + prefix_sm = np.zeros((n, n)) + for i in range(n): + for j in range(n): + prefix_sm[i][j] = self.rank[i][j] + if i - 1 >= 0: prefix_sm[i][j] += prefix_sm[i - 1][j] + if j - 1 >= 0: prefix_sm[i][j] += prefix_sm[i][j - 1] + if i - 1 >= 0 and j - 1 >= 0: prefix_sm[i][j] -= prefix_sm[i - 1][j - 1] + for i in range(n): + for j in range(i, n): + if i == 0: + self.sm[i][j] = prefix_sm[j][j] + else: + self.sm[i][j] = prefix_sm[j][j] - prefix_sm[i - 1][j] \ + - prefix_sm[j][i - 1] + prefix_sm[i - 1][i - 1] + self.sm[j][i] = self.sm[i][j] + + # step 4, determine boundaries + D = 1.0 * self.sm[0][n - 1] / (n * n) + darr, region_arr, idx = [D], [Region(0, n - 1, self.sm)], [] + sum_region, sum_area = float(self.sm[0][n - 1]), float(n * n) + for i in range(n - 1): + mx, pos = -1e9, -1 + for j, region in enumerate(region_arr): + if region.l == region.r: + continue + region.split(self.sm) + den = sum_area - region.area + region.lch.area + region.rch.area + cur = (sum_region - region.tot + region.lch.tot + region.rch.tot) / den + if cur > mx: + mx, pos = cur, j + assert(pos >= 0) + tmp = region_arr[pos] + region_arr[pos] = tmp.rch + region_arr.insert(pos, tmp.lch) + sum_region += tmp.lch.tot + tmp.rch.tot - tmp.tot + sum_area += tmp.lch.area + tmp.rch.area - tmp.area + darr.append(sum_region / sum_area) + idx.append(tmp.best_pos) + + dgrad = [(darr[i + 1] - darr[i]) for i in range(len(darr) - 1)] + + # optional step, smooth gradient + smooth_dgrad = [dgrad[i] for i in range(len(dgrad))] + if len(dgrad) > 1: + smooth_dgrad[0] = (dgrad[0] * 2 + dgrad[1]) / 3.0 + smooth_dgrad[-1] = (dgrad[-1] * 2 + dgrad[-2]) / 3.0 + for i in range(1, len(dgrad) - 1): + smooth_dgrad[i] = (dgrad[i - 1] + 2 * dgrad[i] + dgrad[i + 1]) / 4.0 + dgrad = smooth_dgrad + + avg, stdev = np.average(dgrad), np.std(dgrad) + cutoff = avg + self.std_coeff * stdev + assert(len(idx) == len(dgrad)) + above_cutoff_idx = [i for i in range(len(dgrad)) if dgrad[i] >= cutoff] + if len(above_cutoff_idx) 
== 0: boundary = [] + else: boundary = idx[:max(above_cutoff_idx) + 1] + ret = [0 for _ in range(n)] + for i in boundary: + ret[i] = 1 + # boundary should not be too close + for j in range(i - 1, i + 2): + if j >= 0 and j < n and j != i and ret[j] == 1: + ret[i] = 0 + break + return [1] + ret[:-1] + +class Region: + """ + Used to denote a rectangular region of similarity matrix, + never instantiate this class outside the package. + """ + def __init__(self, l, r, sm_matrix): + assert(r >= l) + self.tot = sm_matrix[l][r] + self.l = l + self.r = r + self.area = (r - l + 1)**2 + self.lch, self.rch, self.best_pos = None, None, -1 + + def split(self, sm_matrix): + if self.best_pos >= 0: + return + if self.l == self.r: + self.best_pos = self.l + return + assert(self.r > self.l) + mx, pos = -1e9, -1 + for i in range(self.l, self.r): + carea = (i - self.l + 1)**2 + (self.r - i)**2 + cur = (sm_matrix[self.l][i] + sm_matrix[i + 1][self.r]) / carea + if cur > mx: + mx, pos = cur, i + assert(pos >= self.l and pos < self.r) + self.lch = Region(self.l, pos, sm_matrix) + self.rch = Region(pos + 1, self.r, sm_matrix) + self.best_pos = pos + + + +model = C99() + +f = open("sampled_data_30(1)_shifted.json",'r') +import json +data = json.load(f) +f.close() + +dialogue = data[0]['Dialogue'] +turns = dialogue.split('\n') +turns = [i[len(i.split(":")[0])+2:] for i in turns] +print(turns) + + +result = model.segment(turns) + +cache = [] +for t,r in zip(turns,result): + if r==1 and len(cache)!=0: + print(cache) + cache = [] + cache.append(t) + #print(t,l) +if len(cache)!=0: + print(cache) +# print(len(turns)) +# print(len(result)) +# print(result) diff --git a/utils/test.py b/utils/test.py new file mode 100644 index 00000000..6c4ecb41 --- /dev/null +++ b/utils/test.py @@ -0,0 +1,102 @@ +from transformers import ( + AutoTokenizer, + AutoModelForSeq2SeqLM, + LogitsProcessorList, + MinLengthLogitsProcessor, + StoppingCriteriaList, + MaxLengthCriteria,LogitsProcessorList,MinLengthLogitsProcessor,NoRepeatNGramLogitsProcessor,ForcedBOSTokenLogitsProcessor +) +import torch +device = "cuda" if torch.cuda.is_available() else "cpu" +tokenizer = AutoTokenizer.from_pretrained("/Users/sheshuaijie/Desktop/RearchSpace/Data/PLM/linydub-bart-large-samsum") +model = AutoModelForSeq2SeqLM.from_pretrained("/Users/sheshuaijie/Desktop/RearchSpace/Data/PLM/linydub-bart-large-samsum") +input_prompt = "Aude: Hi Susie, how is Ted this morning? Did you find plasters?\nSusie: yes. He kept them till this morning after his shower.\nAude: he must look sexy whith them... lol\nSusie: a memory from Poland!" 
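+# Manual greedy decoding: starting from the decoder start token (id 2), repeatedly run the
+# model on the current prefix and append the argmax next token, with no-repeat-3-gram and
+# forced-BOS logits processors applied at every step.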
+ +inputs = tokenizer([input_prompt], max_length=1024, return_tensors="pt") + + + +import numpy as np +import torch +import torch.nn as nn +from transformers import BartTokenizer, BartForConditionalGeneration + + +loss_fct = nn.NLLLoss(reduction='none', ignore_index=model.config.pad_token_id) +lsm = nn.LogSoftmax(dim=1) +def get_candidate(logits): + prob_ = lsm(logits) + values, indices = prob_.topk(10, dim=1, largest=True, sorted=True) + return indices + +def de_tokenize(token_index): + """ + @description : Use tokenizer to decode the token_index + --------- + @param : + tokenindex: tensor + ------- + @Returns : token_list + ------- + """ + token_list = [] + for j in token_index: + token_list.append(tokenizer._convert_id_to_token(j.cpu().numpy().tolist())) + filtered_token_list = [] + for i in token_list: + filtered_token_list.append(tokenizer.convert_tokens_to_string([i])) + return filtered_token_list + + +decode_length = 100 +# tgt_list=[''] +decoded_ids = torch.tensor([[2]]).to(device) +logits_processor = LogitsProcessorList([NoRepeatNGramLogitsProcessor(3),ForcedBOSTokenLogitsProcessor(2)]) +for t in range(decode_length): + encoded_src = tokenizer( + [input_prompt], + max_length=1024, + truncation=True, + padding=True, + return_tensors='pt' + ) + #print(source) + + src_tokens = encoded_src['input_ids'].to(device) + src_mask = encoded_src['attention_mask'].to(device) + + + # src_tokens = src_tokens[:,1:-1] + # src_mask = src_mask[:,1:-1] + # summary_ids = model.generate(src_tokens, num_beams=1, min_length=0, max_length=100) + # print(tokenizer.batch_decode(summary_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)[0]) + # print(summary_ids) + # exit() + #exit() + #print(tgt_tokens) + output = model( + input_ids=src_tokens, + attention_mask=src_mask, + decoder_input_ids=decoded_ids + ) + #logits = output.logits.view(-1, model.config.vocab_size) + next_token_logits = output.logits[:, -1, :] + + + next_tokens_scores = logits_processor(decoded_ids, next_token_logits) + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + decoded_ids = torch.cat([decoded_ids, next_tokens[:, None]], dim=-1) + # print(candidate_decode[:,0]) + # candidate_result = de_tokenize(candidate_decode[:,0]) + # candidate_result2 = de_tokenize(candidate_decode[:,1]) + # print(de_tokenize(tgt_tokens.reshape(-1))) + # print(candidate_result) + # print(candidate_result2) + # decode_result = candidate_result[-1] + # if decode_result == "": + # break + # tgt_list[0] = tgt_list[0] + decode_result + # print(tgt_list) + #print(decoded_ids) + + \ No newline at end of file