# data_collector.py
import requests
from collections import OrderedDict
from data_object import Builds_collection, Build, Ctest_run
import jsonpickle


class Remote_source():
    """Remote pipeline source."""

    def __init__(
            self,
            jenkins_url="https://builds.mantidproject.org/",
            pipeline_name="build_packages_from_branch",
            auth=None) -> None:
        """Create a remote source to parse. Note: this object constructs its
        URLs by stitching together predefined path segments, so update it if
        Jenkins changes its URL scheme.

        Args:
            jenkins_url (str, optional): the URL to Jenkins; it must include the
                trailing slash. Defaults to "https://builds.mantidproject.org/".
            pipeline_name (str, optional): the name of the pipeline. Defaults to
                "build_packages_from_branch".
            auth (tuple(username, password), optional): credentials if the server
                requires authentication. Defaults to None.

        Additional attributes:
            pipeline_url (str): the URL of the pipeline
            job_api (str): the URL of the pipeline job API
            build_url_dict (dict(str)): a dictionary of URLs, one per build
        """
self.jenkins_url = jenkins_url
self.pipeline_name = pipeline_name
self.auth = auth
self.pipeline_url = self.jenkins_url + "job/" + self.pipeline_name
self.job_api = self.jenkins_url + "job/" + self.pipeline_name + "/api/json"
self.build_url_dict = self.get_build_url()
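
    # With the default arguments, the stitched-together URLs come out as:
    #   pipeline_url: https://builds.mantidproject.org/job/build_packages_from_branch
    #   job_api:      https://builds.mantidproject.org/job/build_packages_from_branch/api/json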

    def get_build_url(self):
        """Return a dictionary of {build_id: build_url} for the remote pipeline job.

        Returns:
            dict(str): a dictionary of {build_id: build_url}
        """
job_api = self.job_api
data = requests.get(job_api, auth=self.auth).json()['builds']
build_url_dict = OrderedDict()
for item in data:
build_url_dict[str(item['number'])] = item['url']
return build_url_dict
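
    # Illustrative shape of the returned mapping, with hypothetical build
    # numbers (Jenkins lists builds newest-first, which the OrderedDict keeps):
    #   OrderedDict([('1234', '.../job/build_packages_from_branch/1234/'),
    #                ('1233', '.../job/build_packages_from_branch/1233/')])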

    def get_latest_build_id(self):
        """Get the latest build id of the job from the 'lastBuild' API field.

        Returns:
            int: the latest build id
        """
job_api = self.job_api
data = requests.get(job_api, auth=self.auth).json()['lastBuild']
latest_build = data['number']
return latest_build

    def get_list_of_build_range(self, quantity):
        """Get a list of build ids from the builds API. This skips over deleted builds.

        Args:
            quantity (int): the maximum number of build ids to return

        Returns:
            list: a list of build ids, sorted newest-first
        """
job_api = self.job_api
data = requests.get(job_api, auth=self.auth).json()['builds']
        build_id_list = []
for i in range(min(quantity, len(data))):
build_id_list.append(data[i]['number'])
build_id_list.sort(key=int, reverse=True)
return build_id_list

    def get_log_artifacts_for_build(self, build, file_names):
        """Download the requested log artifacts for a single build.

        Args:
            build (str): the build id to parse logs from (the keys of
                build_url_dict are strings)
            file_names (list(str)): the names of the log files to fetch

        Returns:
            dict(str): a dict of {file_name: {content: content of the log file, url: the url to the log file}};
                files missing from the build's artifacts get None for both fields
        """
build_api = self.build_url_dict[build] + 'api/json'
data = requests.get(build_api, auth=self.auth).json()
artifacts = data['artifacts']
log_files = {}
artifacts_list = []
for item in artifacts:
artifacts_list.append(item['fileName'])
if item['fileName'] in file_names:
file_url = self.build_url_dict[build] + 'artifact/' + item['relativePath']
                file_request = requests.get(file_url, auth=self.auth)
                content = file_request.text
log_files[item['fileName']] = {
"content": content,
"url" : file_url
}
for file in file_names:
if file not in artifacts_list:
log_files[file] = {
"content": None,
"url" : None,
}
return log_files
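
    # Illustrative shape of the dict returned by get_log_artifacts_for_build()
    # (content abbreviated; a file absent from the build's artifacts yields None):
    #   {'linux-gnu.log': {'content': '...ctest output...', 'url': 'https://.../artifact/linux-gnu.log'},
    #    'msys.log': {'content': None, 'url': None}}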


class File_object():
    """Pairs an os/environment key with the name of its corresponding log file."""

    def __init__(self, agent_key, file_name) -> None:
        """Create a pairing of an os/environment key and its log file name.

        Args:
            agent_key (str): the key identifying the os/environment (build agent)
            file_name (str): the name of the log file produced by that agent
        """
self.file_name = file_name
self.agent_key = agent_key
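
# For example, the __main__ block below builds one pairing per build agent,
# such as File_object("linux-gnu", "linux-gnu.log").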


def traverse_data_remote(
        remote_source,
        file_list,
        build_search_range,
        cached_object=None,
        columns=["Build", "Tested", "Passed", "Flake", "Failed", "Timeout"],
        grok_pattern=r'[0-9\/]*Test[ ]*\#%{POSINT:test_num}\: (?<test_name>[^ ]*) [.]*[\* ]{3}%{WORD:outcome}[ ]*%{BASE10NUM:test_time} sec',
        passed_string="Passed",
        failed_string="Failed",
        timeout_string="Timeout"):
    r"""Update the cached_object with the latest information pulled from the
    server, skipping builds that already have complete information.

    Args:
        remote_source (data_collector.Remote_source): the remote source to parse data from
        file_list (list(data_collector.File_object)): the os/environment keys paired with their log file names
        build_search_range (list(str)): the build ids to update
        cached_object (data_object.Builds_collection, optional): an existing collection loaded from JSON using jsonpickle. Defaults to None.
        columns (list, optional): list of columns for the pandas dataframe. Defaults to ["Build", "Tested", "Passed", "Flake", "Failed", "Timeout"].
        grok_pattern (str, optional): grok pattern for log data. Defaults to r'[0-9\/]*Test[ ]*\#%{POSINT:test_num}\: (?<test_name>[^ ]*) [.]*[\* ]{3}%{WORD:outcome}[ ]*%{BASE10NUM:test_time} sec'.
        passed_string (str, optional): key for a passed test. Defaults to "Passed".
        failed_string (str, optional): key for a failed test. Defaults to "Failed".
        timeout_string (str, optional): key for a timed-out test. Defaults to "Timeout".

    Returns:
        data_object.Builds_collection: the (possibly newly created) collection with updated info
    """
existing_completed = set()
    if cached_object is not None:
for build in cached_object.data.keys():
if cached_object.data[build].is_completed:
existing_completed.add(build)
else:
cached_object = Builds_collection({})
print("existing completed:", existing_completed)
targets = list(set(build_search_range) - existing_completed)
print("targets:", targets)
for build in targets:
log_data = remote_source.get_log_artifacts_for_build(build, [f.file_name for f in file_list])
ctest_agents = {}
for i in range(len(file_list)):
print("parsing:", build, file_list[i].file_name, log_data.keys())
            if log_data[file_list[i].file_name]['content'] is not None:
lines = log_data[file_list[i].file_name]['content'].split('\n')
is_not_found = False
else:
lines = []
is_not_found = True
current_agent = Ctest_run(
is_not_found=is_not_found,
lines=lines,
agent_name=file_list[i].agent_key,
aggregate_data_key=columns[1:],
grok_pattern=grok_pattern,
passed_string=passed_string,
failed_string=failed_string,
timeout_string=timeout_string
)
ctest_agents[file_list[i].agent_key] = current_agent
current_build = Build(build, ctest_agents)
cached_object.data[build] = current_build
cached_object.sort()
return cached_object
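
# For reference, the default grok_pattern above targets CTest result lines of
# roughly this shape (illustrative lines, not taken from a real run):
#   23/100 Test  #23: AlgorithmsTest ..................***Failed    1.52 sec
#    1/100 Test   #1: UnitsTest .......................   Passed    0.05 sec
# %{POSINT:test_num} captures the test number, (?<test_name>[^ ]*) the test
# name, %{WORD:outcome} the Passed/Failed/Timeout outcome, and
# %{BASE10NUM:test_time} the duration in seconds.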

if __name__ == '__main__':
    remote_source_test = Remote_source()
    num_past_build = 10
    latest_build_id = remote_source_test.get_latest_build_id()
    print(type(latest_build_id))
    build_range = list(range(latest_build_id, max(1, latest_build_id - num_past_build) - 1, -1))
    build_range = [str(i) for i in build_range]
    print(build_range)
    agent_keys = ["darwin17", "linux-gnu", "msys"]
    file_names = ["darwin17.log", "linux-gnu.log", "msys.log"]
    file_list = [
        File_object(agent_keys[i], file_names[i]) for i in range(len(file_names))
    ]
    try:
        with open('sandbox/testing_pickle', 'r') as f:
            string = f.read()
        load = jsonpickle.decode(string)
    except FileNotFoundError:
        # no cached collection yet; traverse_data_remote will start a fresh one
        load = None
    data = traverse_data_remote(remote_source_test, file_list, build_range, cached_object=load)
    data.toJson_file('sandbox/testing', False)
    data.toJson_file('sandbox/testing_pickle', True)