-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_split_tut.py
78 lines (63 loc) · 2.59 KB
/
test_split_tut.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import pandas as pd
import argparse
import re
def split_quoted_string(input_str):
if not isinstance(input_str ,str):
return []
# # 定义分隔符为三个双引号后跟一个中文逗号,或者一个中文逗号和一个空格,或者一个中文逗号和一个换行符
# delimiters = ['","', '", "', '",\n"','",\n\n"', '"\n\n"','"\n\n\n"', ", ", ", \n", " ", " \n"]
#
# 1
# # 去除两端的双引号
# input_str = input_str.strip('"')
#
# # 检查是否有分隔符,并分割字符串
# for delimiter in delimiters:
# if delimiter in input_str:
# items = input_str.split(delimiter)
# break
#
# else: # 如果没有找到任何分隔符,将整个字符串作为一个元素
# items = [input_str]
# 对列表中的每个元素去除两端的双引号
# result = [item.strip('"') for item in items if item.strip('"') != '']
# use regular expression
# first pattern: string between the first quote and quote followed by a comma + space
# last pattern: string between quote following a comma + space and the last quote
# other basic pattern: string between quote following a comma + space and quote followed by a comma + space
first_pattern = re.compile(r'^"(([^"]|"")+)",\s')
other_pattern = re.compile(r'(?<=\s")((?:[^"]|"")+)(?=",\s)')
last_pattern = re.compile(r'(?<=\s")(([^"]|"")+)"$')
# find the first string
first_match = first_pattern.match(input_str)
result = []
if first_match:
result.append(first_match.group(1))
remaining_string = input_str[first_match.end():]
else:
remaining_string = input_str
# find the last string
last_match = last_pattern.search(remaining_string)
if last_match:
last_part = last_match.group(1)
remaining_string = remaining_string[:last_match.start()].rstrip(", ")
else:
last_part = None
# find other strings in the middle
matches = other_pattern.findall(remaining_string)
result.extend(matches)
# add the last tutorial string
if last_part:
result.append(last_part)
# add other middle strings
matches = other_pattern.findall(remaining_string)
result.extend(matches)
return result
def test_split_from_csv(csv_file):
dataset_df = pd.read_csv(csv_file)
for i, row in dataset_df.iterrows():
tutorial_strings_list = split_quoted_string(row['tutorial_strings'])
if len(tutorial_strings_list) > 3:
print(tutorial_strings_list)
if __name__ == "__main__":
test_split_from_csv("positive_0725.csv")