-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprocess_out.py
79 lines (61 loc) · 2.61 KB
/
process_out.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re
import pandas as pd
# Read the raw paraphrase dump. The encoding is given explicitly so that
# decoding does not depend on the platform's default locale encoding.
with open('paraphrase_out.txt', 'r', encoding='utf-8') as file:
    text = file.read()
# Split the text into sections on every run of 15 or more '-' characters.
sections = re.split(r'-{15,}', text)
# Define a function to extract sentences and paraphrases
def extract_sentences_and_paraphrases(section: str) -> tuple[list[str], list[str]]:
    """Collect the sentences and numbered paraphrases found in *section*.

    Every line of ``section`` (split on ``'\\n'``) is tested from its first
    character against two patterns, in this order:

    * ``Sentence: <text>`` -- ``<text>`` is added to the sentence list.
    * ``<digits>. <text>`` -- ``<text>`` is added to the paraphrase list
      after a leading ``<p>`` and a trailing ``</p>`` are removed.

    Lines matching neither pattern (including lines that start with
    whitespace) are ignored.

    Args:
        section: One chunk of the paraphrase output text.

    Returns:
        A ``(sentences, paraphrases)`` pair; both lists keep the order of
        appearance and hold whitespace-stripped strings.
    """
    # Compile once per call instead of resolving the pattern for every line.
    sentence_re = re.compile(r"Sentence:\s+(.+)")
    paraphrase_re = re.compile(r"\d+\.\s+(.+)")

    sentences: list[str] = []
    paraphrases: list[str] = []
    for line in section.split('\n'):
        # The paraphrase pattern is only tried when the line is not a sentence.
        if match := sentence_re.match(line):
            sentences.append(match.group(1).strip())
        elif match := paraphrase_re.match(line):
            entry = match.group(1).strip()
            paraphrases.append(entry.removeprefix("<p>").removesuffix("</p>"))
    return sentences, paraphrases
# Column-oriented buffer for the output table: one list entry per CSV row.
# NOTE: the 'Orginal' spelling is the header written to the CSV and is kept
# unchanged so existing consumers of the file keep working.
data = {'Row ID': [], 'Category': [], 'Paraphrases': [], 'Orginal': []}
idx = 0


def _split_category(entry: str) -> tuple[str, str]:
    """Split a ``Category: "text"`` entry into ``(category, text)``.

    Only the first ':' separates the category, so colons inside the
    paraphrase text are preserved. An entry without any ':' is returned as
    text with an empty category instead of raising. Surrounding double
    quotes and a literal ``<p>`` / ``</p>`` wrapper are removed from the text.
    """
    category, colon, body = entry.partition(":")
    if not colon:
        category, body = "", entry
    body = body.strip().strip('"')
    # removeprefix/removesuffix drop the literal tags only; str.strip("</p>")
    # would instead eat any leading/trailing '<', '/', 'p' or '>' characters.
    body = body.removeprefix("<p>").removesuffix("</p>")
    return category.strip(), body


# Process each section
for section in sections:
    sentences, paraphrases = extract_sentences_and_paraphrases(section)
    if not paraphrases:
        continue
    for paraphrase_index, entry in enumerate(paraphrases, start=1):
        # The source sentence is written once, on the first row of its group,
        # as plain text rather than as the repr of a Python list.
        if paraphrase_index == 1:
            data['Orginal'].append(" ".join(sentences))
        else:
            data['Orginal'].append("")
        data['Row ID'].append(f"{idx}")
        category, paraphrase = _split_category(entry)
        print(paraphrase)
        if paraphrase == "":
            # Placeholder so an empty paraphrase is still visible in the CSV.
            paraphrase = '<nostr>'
        data['Paraphrases'].append(paraphrase)
        data['Category'].append(category)
        idx += 1
    # Add a dashed separator row after each group
    data['Row ID'].append(f"{idx}")
    data['Paraphrases'].append('-' * 15)
    data['Category'].append('-' * 15)
    data['Orginal'].append("")
    idx += 1

# Create a DataFrame
df = pd.DataFrame(data)
# Save the DataFrame to a CSV file
df.to_csv('paraphrases_cat.csv', index=False)
# This script creates a CSV file named "paraphrases_cat.csv" containing row IDs, categories, paraphrases and the original sentence, with a dashed separator row after each group of paraphrases.
# !python train_tempobert.py --model_name_or_path bert-base-uncased \
# --train_path /datasets/nyt_with10k_every10 \
# --do_train \
# --output_dir output \
# --time_embedding_type prepend_token \
# --time_mlm_probability 0.9 \
# --max_seq_length 128