#!/usr/bin/env python3
#
# Copyright 2021-2022 Clemens Lutz
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Information
# ===========
#
# This script runs a GPU radix join and the Triton join. The parameters are
# tuned for an IBM AC922 machine, which has two IBM POWER9 CPUs and two Nvidia
# V100 GPUs connected via NVLink 2.0.
#
# The joins run on one GPU. The prefix sum runs on either a CPU or the GPU. The
# CPU is faster for the prefix sum, because NVLink is slower than main memory
# for unidirectional transfers.
#
# The Triton join currently requires the GPU memory to be visible as a NUMA
# node in Linux. This is the case with NVLink 2.0, but not with PCI-e GPUs.
# However, the normal GPU radix join can also run over PCI-e.
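#
# One way to check whether the GPU memory is visible as a NUMA node (a
# suggested check; assumes numactl is installed) is to list the nodes with:
#
#   numactl --hardware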
#
# Setup notes
# ===========
#
# Before running this benchmark, allocate huge pages by running:
#
# sudo bash -c 'echo 1 > /proc/sys/vm/compact_memory'
# sudo bash -c 'echo 63000 > /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages'
# sudo bash -c 'echo 10000 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_overcommit_hugepages'
#
# In contrast to the no-partitioning join, the radix joins are not affected by
# page fragmentation.

import math
import subprocess
import socket
import itertools
import shlex
import tempfile
from os import path

import pandas

repeat = 10
tuples = [ 128 * x for x in range(1, 17) ]  # relation sizes in millions of tuples: 128M, 256M, ..., 2048M
tuple_bytes = [ 16 ]  # bytes per tuple, passed to the join as --tuple-bytes Bytes16
radix_bits_snd = 9 # Tuning on Volta shows 9 bits to have best throughput
data_location = 0  # NUMA node on which relations and partitions are allocated
execution_strategies = [ 'GpuRadixJoinTwoPass', 'GpuTritonJoinTwoPass' ]
hashing_schemes = [ 'Perfect', 'BucketChaining' ]
prefix_sum_algorithms = [ 'CpuChunkedSimd', 'GpuChunked' ]
partition_algorithms = [ 'GpuHSSWWCv4' ]
page_type = [ 'Huge2MB' ]  # requires the 2 MB huge pages allocated in the setup notes above
dmem_buffer_size = 8
# Estimated for a Volta GPU with bucket chaining, based on tuning; Volta
# exposes up to 96 KiB of shared memory per SM
max_shared_memory_bytes = 79000
# Use 40 CPU threads as a divisor of 80 chunks/morsels (Volta has 80 SMs).
# 64 threads cause a load imbalance (64 threads = 80 GiB/s; 40 threads = 110 GiB/s).
# Compensate by balancing the workload across L2+L3 cache sub-systems (each shared by a core-pair).
cpu_mapping_1s_4way_smt = { 'sockets': 1, 'threads': 40, 'smt': 4, 'mapping': '0 8 16 24 32 40 48 56 4 12 20 28 36 44 52 60 1 9 17 25 33 41 49 57 5 13 21 29 37 45 53 61 2 10 18 26 34 42 50 58 6 14 22 30 38 46 54 62 3 11 19 27 35 43 51 59 7 15 23 31 39 47 55 63' }
cpu_mapping = cpu_mapping_1s_4way_smt
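
# Sanity check: the mapping is assumed to be a space-separated list of logical
# CPU IDs and must cover at least the requested number of worker threads.
assert len(cpu_mapping['mapping'].split()) >= cpu_mapping['threads'], \
    'CPU affinity mapping provides fewer hardware threads than requested'
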
hostname = socket.gethostname()


def main():
    file_id = 0
    file_list = []
    out_dir = tempfile.mkdtemp()
    out_csv = path.join(out_dir, f'benchmark_gpu_radix_join_{hostname}.csv')
    cpu_affinity_file = path.join(out_dir, 'cpu_affinity.txt')

    with open(cpu_affinity_file, mode='w') as file:
        file.write(cpu_mapping['mapping'] + '\n')

    print(f"Writing CSV file to {out_csv}")

    for ts, es, hs, ha, pa, tb, pt in itertools.product(
            tuples, execution_strategies, hashing_schemes,
            prefix_sum_algorithms, partition_algorithms, tuple_bytes,
            page_type):
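        # Pick the total radix bits so that each final partition of the inner
        # relation (ts * 10**6 tuples of tb bytes each) fits into GPU shared
        # memory; the bits beyond the fixed second pass go to the first pass.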
        radix_bits_fst = math.ceil(math.log2(ts * 10**6 * tb / max_shared_memory_bytes)) - radix_bits_snd
        rb = (radix_bits_fst, radix_bits_snd)

        print(f'Running {es} with {ha} and {pa} and {hs} with tuples: {ts!s}M radix bits: {rb[0]!s}:{rb[1]!s} tuple bytes: {tb!s} page type: {pt}')

        for count in range(0, repeat):
            print('.', end='', flush=True)

            tmp_csv = path.join(out_dir, f'tmp_{file_id!s}.csv')
            cmd = f'''
                cargo run \
                    --quiet \
                    --package radix-join \
                    --release \
                    -- \
                    --execution-strategy {es} \
                    --hashing-scheme {hs} \
                    --histogram-algorithm {ha} \
                    --partition-algorithm {pa} \
                    --partition-algorithm-2nd GpuSSWWCv2 \
                    --radix-bits {rb[0]!s},{rb[1]!s} \
                    --page-type {pt} \
                    --dmem-buffer-size {dmem_buffer_size!s} \
                    --threads {cpu_mapping['threads']!s} \
                    --cpu-affinity {cpu_affinity_file} \
                    --rel-mem-type numa \
                    --inner-rel-location {data_location!s} \
                    --outer-rel-location {data_location!s} \
                    --partitions-mem-type numa \
                    --partitions-location {data_location!s} \
                    --data-set Custom \
                    --inner-rel-tuples {ts * 10**6!s} \
                    --outer-rel-tuples {ts * 10**6!s} \
                    --tuple-bytes Bytes{tb!s} \
                    --repeat 2 \
                    --csv {tmp_csv}
                '''
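
            # Run the join binary; check_returncode() raises
            # subprocess.CalledProcessError on a non-zero exit status, which
            # aborts the parameter sweep early.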
            cmdfuture = subprocess.run(shlex.split(cmd), check=False)
            cmdfuture.check_returncode()
            # print(cmdfuture.stderr)

            file_list.append(tmp_csv)
            file_id += 1

        print('')

    csv_append(out_csv, file_list)
    print(f"Finished CSV file at {out_csv}")


def csv_append(accumulator_file, append_files):
    """Concatenate the per-run CSV files into a single accumulator file."""
    df_list = [pandas.read_csv(f) for f in append_files]
    df = pandas.concat(df_list)
    df.to_csv(accumulator_file, index=False)


if __name__ == "__main__":
    main()