
Commit b159622

Analyzing results, tester.py fixes.
1 parent 3ed8499 commit b159622

File tree

3 files changed: +163, -30 lines

devops/analyze.sql

Lines changed: 123 additions & 0 deletions
@@ -0,0 +1,123 @@
+drop table if exists shmn_benchmarks cascade;
+create table shmn_benchmarks (
+    test_id text, instance_type text, workers int, nparts int, sharded_tables text,
+    replicas int, repmode text, sync_replicas bool, sync_commit text, CFLAGS text,
+    scale int, seconds int, test text, fdw_2pc bool, active_workers text, clients int,
+    tps_sum int, avg_latency numeric, end_latency numeric, wal_lag bigint,
+    comment text);
+copy shmn_benchmarks from '/home/ars/shmn_benchmarks.csv' with (format csv, header);
+
+select workers, nparts, repmode, sync_replicas, clients, tps_sum, pg_size_pretty(wal_lag) from shmn_benchmarks;
+
+-- only the important fields
+create view shmn_bench as select workers, nparts, sharded_tables, repmode, sync_replicas, test, fdw_2pc, active_workers, clients, tps_sum,
+    pg_size_pretty(wal_lag) wal_lag
+from shmn_benchmarks;
+drop view if exists shmn_bench;
+
+-- Take only the runs whose number of clients maximizes tps: for each set of
+-- rows differing only in the number of clients, pick from the window the
+-- row with the max tps.
+drop view if exists shmn_benchmarks_optimal_clients;
+create view shmn_benchmarks_optimal_clients as
+select distinct on (workers, nparts, sharded_tables, repmode, sync_replicas, test, fdw_2pc, active_workers)
+    workers, nparts, sharded_tables, repmode, sync_replicas, test, fdw_2pc, active_workers,
+    last_value(clients) over wnd clients,
+    last_value(tps_sum) over wnd tps_sum,
+    last_value(avg_latency) over wnd avg_latency,
+    last_value(end_latency) over wnd end_latency,
+    pg_size_pretty(last_value(wal_lag) over wnd) wal_lag
+from shmn_benchmarks
+window wnd as
+    (partition by workers, nparts, sharded_tables, repmode, sync_replicas, fdw_2pc, test, active_workers order by tps_sum
+     rows between unbounded preceding and unbounded following);
+
+-- Create first() aggregate, taken from
+-- https://wiki.postgresql.org/wiki/First/last_(aggregate)
+-- Create a function that always returns the first non-NULL item
+CREATE OR REPLACE FUNCTION public.first_agg ( anyelement, anyelement )
+RETURNS anyelement LANGUAGE SQL IMMUTABLE STRICT AS $$
+    SELECT $1;
+$$;
+
+-- And then wrap an aggregate around it
+drop aggregate if exists public.first(anyelement);
+CREATE AGGREGATE public.FIRST (
+    sfunc    = public.first_agg,
+    basetype = anyelement,
+    stype    = anyelement
+);
+
+-- flatten to compare no replication, trigger, sync and async logical; 2pc runs only
+select workers, nparts, (nparts / workers) nparts_per_node,
+    first(tps_sum) filter (where repmode is null) no_rep_tps,
+    first(tps_sum) filter (where repmode = 'trigger') trig_rep_tps,
+    first(tps_sum) filter (where repmode = 'logical' and sync_replicas) sync_rep_tps,
+    first(tps_sum) filter (where repmode = 'logical' and not sync_replicas) async_rep_tps,
+    first(wal_lag) filter (where repmode = 'logical' and not sync_replicas) async_rep_wal_lag
+from shmn_benchmarks_optimal_clients
+where active_workers = workers::text and fdw_2pc
+group by workers, nparts;
+
+-- same, also showing the number of clients
+select workers, nparts, (nparts / workers) nparts_per_node,
+    first(tps_sum) filter (where repmode is null) no_rep_tps,
+    first(clients) filter (where repmode is null) no_rep_clients,
+    first(tps_sum) filter (where repmode = 'trigger') trig_rep_tps,
+    first(tps_sum) filter (where repmode = 'logical' and sync_replicas) sync_rep_tps,
+    first(tps_sum) filter (where repmode = 'logical' and not sync_replicas) async_rep_tps,
+    first(wal_lag) filter (where repmode = 'logical' and not sync_replicas) async_rep_wal_lag
+from shmn_benchmarks_optimal_clients
+where active_workers = workers::text and fdw_2pc
+group by workers, nparts;
+
+-- both with and without 2pc, showing which is which
+select workers, nparts, (nparts / workers) nparts_per_node,
+    first(tps_sum) filter (where repmode is null) no_rep_tps,
+    first(fdw_2pc) filter (where repmode is null) no_rep_2pc,
+    first(tps_sum) filter (where repmode = 'trigger') trig_rep_tps,
+    first(fdw_2pc) filter (where repmode = 'trigger') trig_rep_2pc,
+    first(tps_sum) filter (where repmode = 'logical' and sync_replicas) sync_rep_tps,
+    first(fdw_2pc) filter (where repmode = 'logical' and sync_replicas) sync_rep_2pc,
+    first(tps_sum) filter (where repmode = 'logical' and not sync_replicas) async_rep_tps,
+    first(fdw_2pc) filter (where repmode = 'logical' and not sync_replicas) async_rep_2pc,
+    first(wal_lag) filter (where repmode = 'logical' and not sync_replicas) async_rep_wal_lag
+from shmn_benchmarks_optimal_clients
+where active_workers = workers::text
+group by workers, nparts;
+
+
+select workers, nparts, sharded_tables, repmode, sync_replicas, clients, tps_sum,
+    wal_lag
+from shmn_benchmarks_optimal_clients where active_workers = workers::text and fdw_2pc;
+
+select workers, nparts, sharded_tables, repmode, sync_replicas, fdw_2pc, test, clients, tps_sum,
+    wal_lag
+from shmn_benchmarks_optimal_clients where active_workers = workers::text and sharded_tables = 'pgbench_accounts';
+
+
+-- note how the lag grows only where there are already too many clients
+select * from shmn_bench where repmode = 'logical' and not sync_replicas and fdw_2pc and (
+    (workers = 3 and (nparts = 9 or nparts = 30)) or
+    (workers = 6 and (nparts = 6 or nparts = 12 or nparts = 18 or nparts = 60)) or
+    (workers = 9 and (nparts = 27 or nparts = 90)) or
+    (workers = 12)
+)
+order by workers, nparts, clients;
+
+-- same, only for 6:6 and 6:12
+select * from shmn_bench where repmode = 'logical' and not sync_replicas and fdw_2pc and (
+    (workers = 6 and (nparts = 6 or nparts = 12))
+)
+order by workers, nparts, clients;
+
+
+-- 2pc vs non-2pc
+select *, (s.no_two_pc_tps::numeric(10, 0) / s.two_pc_tps)::numeric(3, 2) no_two_pc_faster_times from
+    (select workers, nparts, sharded_tables, repmode, sync_replicas, test, active_workers, clients,
+        first(tps_sum) filter (where fdw_2pc) two_pc_tps,
+        first(tps_sum) filter (where not fdw_2pc) no_two_pc_tps
+     from shmn_bench
+     group by workers, nparts, sharded_tables, repmode, sync_replicas, test, active_workers, clients) s
+where (s.two_pc_tps is not null and s.no_two_pc_tps is not null)
+order by workers, nparts, repmode, sync_replicas, active_workers, clients;
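
Aside, not part of the commit: the optimal-clients view pairs DISTINCT ON with last_value() over a full-partition window. Because the window is ordered by tps_sum and framed as "rows between unbounded preceding and unbounded following", last_value() returns the values of the max-tps row on every row of the partition, so DISTINCT ON can keep any one row per group and still get the right numbers. A minimal sketch of the same trick, with made-up table and column names:

    -- Illustration only, not part of the commit.
    create temp table demo (grp int, clients int, tps int);
    insert into demo values (1, 16, 900), (1, 32, 1400), (1, 64, 1100);

    -- Every row of the partition sees the same last_value() results (those
    -- of the max-tps row, since the frame spans the whole partition ordered
    -- by tps), so distinct on (grp) keeps exactly one such row per group.
    select distinct on (grp)
        grp,
        last_value(clients) over wnd clients,
        last_value(tps) over wnd tps
    from demo
    window wnd as (partition by grp order by tps
                   rows between unbounded preceding and unbounded following);
    -- => (1, 32, 1400)

The file itself can presumably be replayed end-to-end with psql -f devops/analyze.sql, once shmn_benchmarks.csv is in place at the hard-coded /home/ars path.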

devops/logs.yml

Lines changed: 2 additions & 0 deletions
@@ -27,6 +27,7 @@
     local_action: file
       src={{ inventory_hostname }}.log
       dest={{ logs }}/{{ node_id.stdout }}.log
+      force=yes
       state=link
 
 - name: touch logs/united.log
@@ -42,4 +43,5 @@
     local_action: file
       src={{ groups['shardlord'][0] }}.log
      dest={{ logs }}/shardlord.log
+      force=yes
       state=link
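
The force=yes additions look like an idempotency fix (my inference; the commit message does not say): with state=link, Ansible's file module refuses to replace an existing regular file at the destination unless force is set, so re-running log collection over leftovers from a previous run would otherwise fail.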

devops/tester.py

Lines changed: 38 additions & 30 deletions
@@ -5,35 +5,42 @@
 import csv
 import os
 import shutil
+import subprocess
 from subprocess import check_call, check_output
 
 from inventory_ec2.ec2_elect_shardlord import ec2_elect_shardlord
 
 scale = 10
 duration = 30
 
-workers = [3, 6, 9]
+workers = [6]
 # replications = ['none', 'logical_sync', 'logical_async', 'trigger']
-replications = ['logical_async']
+replications = ['logical_sync']
 async_config = '''
 synchronous_commit = local
 shardman.sync_replicas = off
 '''
-nparts_per_node = [3, 10]
+nparts_per_node = [2]
 clients = [1, 16, 32, 64, 128]
 
-# debug
-# workers = [3]
-# replications = ['logical_async']
-# nparts_per_node = [3]
-# clients = [8]
-
 create_instances = True
 destroy_instances = True
 provision = True
 rebuild_shardman = True
 prepare = True
 
+
+# debug
+# workers = [3]
+# replications = ['logical_async']
+# nparts_per_node = [3]
+# clients = [8]
+# create_instances = False
+# destroy_instances = False
+# provision = False
+# rebuild_shardman = False
+# prepare = False
+
 resfile_path = "tester_res.csv"
 resfile_writer = None
 
@@ -48,10 +55,10 @@ def __init__(self):
         self.wal_lag = ''
 
 def res_header():
-    return ["test_id", "instance_type", "num of workers", "nparts", "replicas",
-            "repmode", "sync_replicas", "sync_commit", "CFLAGS", "scale",
+    return ["test_id", "instance_type", "num of workers", "nparts", "sharded tables",
+            "replicas", "repmode", "sync_replicas", "sync_commit", "CFLAGS", "scale",
             "seconds", "test", "fdw 2pc", "active_workers", "clients", "tps sum",
-            "avg latency", "end latency", "wal lag"]
+            "avg latency", "end latency", "wal lag", "comment"]
 
 def tester():
     if os.path.isfile(resfile_path):
@@ -139,11 +146,19 @@ def run(test_row):
         ssh {} 'nohup /home/ubuntu/monitor_wal_lag.sh > monitor_wal_lag.out 2>&1 &'
         '''.format(mon_node), shell=True)
 
-    run_output = check_output(
-        '''
-        ansible-playbook -i inventory_ec2/ pgbench_run.yml -e \
-        'tmstmp=true tname=t pgbench_opts="-N -c {} -T {}"'
-        '''.format(test_row.clients, duration), shell=True).decode("ascii")
+    try:
+        run_output = check_output(
+            '''
+            ansible-playbook -i inventory_ec2/ pgbench_run.yml -e \
+            'tmstmp=true tname=t pgbench_opts="-N -c {} -T {}"'
+            '''.format(test_row.clients, duration), shell=True, stderr=subprocess.STDOUT).decode("ascii")
+    except subprocess.CalledProcessError as e:
+        print('pgbench_run failed, stdout and stderr was:')
+        print(e.output.decode('ascii'))
+        raise e
+
+    print('Here is run output:')
+    print(run_output)
 
     # stop wal lag monitoring and pull the results
     if test_row.replication == 'logical_async':
@@ -152,11 +167,11 @@ def run(test_row):
         max_lag_bytes = int(check_output(
             "awk -v max=0 '{if ($1 > max) {max=$1} }END {print max}' wal_lag.txt",
             shell=True))
-        test_row.wal_lag = size_pretty(max_lag_bytes)
+        test_row.wal_lag = max_lag_bytes
+    else:
+        test_row.wal_lag = ''
 
 
-    print('Here is run output:')
-    print(run_output)
     test_id_re = re.compile('test_id is ([\w-]+)')
     test_id = test_id_re.search(run_output).group(1)
     print('test id is {}'.format(test_id))
@@ -190,17 +205,10 @@ def form_csv_row(test_row):
         sync_commit = 'local'
 
     return [test_row.test_id, 'c3.2xlarge', test_row.workers, test_row.workers * test_row.nparts_per_node,
-            replicas, repmode, sync_replicas, sync_commit, "-O2", scale, duration,
+            "pgbench_accounts", replicas, repmode, sync_replicas, sync_commit, "-O2", scale, duration,
            "pgbench -N", "on", test_row.workers, test_row.clients, test_row.tps_sum,
-            '', '', test_row.wal_lag]
-
-def size_pretty(size, precision=2):
-    suffixes = ['B','KB','MB','GB','TB']
-    suffixIndex = 0
-    while size > 1024 and suffixIndex < 4:
-        suffixIndex += 1
-        size = size / 1024.0
-    return "%.*f%s" % (precision, size, suffixes[suffixIndex])
+            '', '', test_row.wal_lag, '']
+
 
 if __name__ == '__main__':
     tester()
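
One connecting observation, inferred from the diff rather than stated in it: tester.py now records the maximum WAL lag as raw bytes (and drops its local size_pretty() helper), leaving human-readable formatting to analysis time via PostgreSQL's pg_size_pretty(), as analyze.sql above already does. For instance:

    select pg_size_pretty(123456789::bigint);  -- roughly '118 MB'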
